Posted to commits@tika.apache.org by bo...@apache.org on 2016/01/16 19:23:06 UTC
svn commit: r1725014 [23/28] - in /tika/branches/2.x:
tika-parser-bundles/tika-multimedia-bundle/ tika-parser-modules/
tika-parser-modules/tika-advanced-module/
tika-parser-modules/tika-advanced-parser-module/
tika-parser-modules/tika-advanced-parser-m...
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/geoinfo/GeographicInformationParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,391 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geoinfo;
+
+import org.apache.sis.internal.util.CheckedArrayList;
+import org.apache.sis.internal.util.CheckedHashSet;
+import org.apache.sis.metadata.iso.DefaultMetadata;
+import org.apache.sis.metadata.iso.DefaultMetadataScope;
+import org.apache.sis.metadata.iso.constraint.DefaultLegalConstraints;
+import org.apache.sis.metadata.iso.extent.DefaultGeographicBoundingBox;
+import org.apache.sis.metadata.iso.extent.DefaultGeographicDescription;
+import org.apache.sis.metadata.iso.identification.DefaultDataIdentification;
+import org.apache.sis.storage.DataStore;
+import org.apache.sis.storage.DataStoreException;
+import org.apache.sis.storage.DataStores;
+import org.apache.sis.storage.UnsupportedStorageException;
+import org.apache.sis.util.collection.CodeListSet;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.opengis.metadata.Identifier;
+import org.opengis.metadata.citation.Citation;
+import org.opengis.metadata.citation.CitationDate;
+import org.opengis.metadata.citation.OnlineResource;
+import org.opengis.metadata.citation.ResponsibleParty;
+import org.opengis.metadata.constraint.Restriction;
+import org.opengis.metadata.distribution.DigitalTransferOptions;
+import org.opengis.metadata.distribution.Distribution;
+import org.opengis.metadata.distribution.Distributor;
+import org.opengis.metadata.distribution.Format;
+import org.opengis.metadata.extent.Extent;
+import org.opengis.metadata.extent.GeographicExtent;
+import org.opengis.metadata.identification.Identification;
+import org.opengis.metadata.identification.Keywords;
+import org.opengis.metadata.identification.Progress;
+import org.opengis.metadata.identification.TopicCategory;
+import org.opengis.util.InternationalString;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.Charset;
+import java.util.*;
+
+
+public class GeographicInformationParser extends AbstractParser{
+
+ public static final String geoInfoType="text/iso19139+xml";
+ private final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.text("iso19139+xml"));
+
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
+ metadata.set(Metadata.CONTENT_TYPE,geoInfoType);
+ XHTMLContentHandler xhtmlContentHandler=new XHTMLContentHandler(contentHandler,metadata);
+
+ try (TemporaryResources tmp = new TemporaryResources()) {
+ TikaInputStream tikaInputStream=TikaInputStream.get(inputStream,tmp);
+ File file= tikaInputStream.getFile();
+ // DataStores.open() either returns a usable store or throws, so no null check is needed.
+ DataStore dataStore = DataStores.open(file);
+ DefaultMetadata defaultMetadata=new DefaultMetadata(dataStore.getMetadata());
+ extract(xhtmlContentHandler, metadata, defaultMetadata);
+ } catch (UnsupportedStorageException e) {
+ throw new TikaException("UnsupportedStorageException",e);
+ } catch (DataStoreException e) {
+ throw new TikaException("DataStoreException",e);
+ }
+ }
+
+ private void extract(XHTMLContentHandler xhtmlContentHandler,Metadata metadata, DefaultMetadata defaultMetadata) throws SAXException{
+ try {
+ getMetaDataCharacterSet(metadata, defaultMetadata);
+ getMetaDataContact(metadata, defaultMetadata);
+ getMetaDataIdentificationInfo(metadata, defaultMetadata);
+ getMetaDataDistributionInfo(metadata, defaultMetadata);
+ getMetaDataDateInfo(metadata, defaultMetadata);
+ getMetaDataResourceScope(metadata, defaultMetadata);
+ getMetaDataParentMetaDataTitle(metadata, defaultMetadata);
+ getMetaDataIdentifierCode(metadata, defaultMetadata);
+ getMetaDataStandard(metadata, defaultMetadata);
+ extractContent(xhtmlContentHandler, defaultMetadata);
+ }
+ catch(Exception e){
+ e.printStackTrace();
+ }
+ }
+
+ private void extractContent(XHTMLContentHandler xhtmlContentHandler, DefaultMetadata defaultMetadata) throws SAXException{
+ xhtmlContentHandler.startDocument();
+ xhtmlContentHandler.newline();
+
+ xhtmlContentHandler.newline();
+ ArrayList<Identification> identifications= (ArrayList<Identification>) defaultMetadata.getIdentificationInfo();
+ for(Identification i:identifications) {
+ xhtmlContentHandler.startElement("h1");
+ xhtmlContentHandler.characters(i.getCitation().getTitle().toString());
+ xhtmlContentHandler.endElement("h1");
+ xhtmlContentHandler.newline();
+
+ ArrayList<ResponsibleParty> responsiblePartyArrayList = (ArrayList<ResponsibleParty>) i.getCitation().getCitedResponsibleParties();
+ for (ResponsibleParty r : responsiblePartyArrayList) {
+ xhtmlContentHandler.startElement("h3");
+ xhtmlContentHandler.newline();
+ xhtmlContentHandler.characters("CitedResponsiblePartyRole " + r.getRole().toString());
+ xhtmlContentHandler.characters("CitedResponsiblePartyName " + r.getIndividualName().toString());
+ xhtmlContentHandler.endElement("h3");
+ xhtmlContentHandler.newline();
+ }
+
+ xhtmlContentHandler.startElement("p");
+ xhtmlContentHandler.newline();
+ xhtmlContentHandler.characters("IdentificationInfoAbstract " + i.getAbstract().toString());
+ xhtmlContentHandler.endElement("p");
+ xhtmlContentHandler.newline();
+ Collection<Extent> extentList=((DefaultDataIdentification) i).getExtents();
+ for(Extent e:extentList){
+ ArrayList<GeographicExtent> geoElements= (ArrayList<GeographicExtent>) e.getGeographicElements();
+ for(GeographicExtent g:geoElements) {
+
+ if (g instanceof DefaultGeographicBoundingBox) {
+ xhtmlContentHandler.startElement("tr");
+ xhtmlContentHandler.startElement("td");
+ xhtmlContentHandler.characters("GeographicElementWestBoundLatitude");
+ xhtmlContentHandler.endElement("td");
+ xhtmlContentHandler.startElement("td");
+ xhtmlContentHandler.characters(String.valueOf(((DefaultGeographicBoundingBox) g).getWestBoundLongitude()));
+ xhtmlContentHandler.endElement("td");
+ xhtmlContentHandler.endElement("tr");
+ xhtmlContentHandler.startElement("tr");
+ xhtmlContentHandler.startElement("td");
+ xhtmlContentHandler.characters("GeographicElementEastBoundLatitude");
+ xhtmlContentHandler.endElement("td");
+ xhtmlContentHandler.startElement("td");
+ xhtmlContentHandler.characters(String.valueOf(((DefaultGeographicBoundingBox) g).getEastBoundLongitude()));
+ xhtmlContentHandler.endElement("td");
+ xhtmlContentHandler.endElement("tr");
+ xhtmlContentHandler.startElement("tr");
+ xhtmlContentHandler.startElement("td");
+ xhtmlContentHandler.characters("GeographicElementNorthBoundLatitude");
+ xhtmlContentHandler.endElement("td");
+ xhtmlContentHandler.startElement("td");
+ xhtmlContentHandler.characters(String.valueOf(((DefaultGeographicBoundingBox) g).getNorthBoundLatitude()));
+ xhtmlContentHandler.endElement("td");
+ xhtmlContentHandler.endElement("tr");
+ xhtmlContentHandler.startElement("tr");
+ xhtmlContentHandler.startElement("td");
+ xhtmlContentHandler.characters("GeographicElementSouthBoundLatitude");
+ xhtmlContentHandler.endElement("td");
+ xhtmlContentHandler.startElement("td");
+ xhtmlContentHandler.characters(String.valueOf(((DefaultGeographicBoundingBox) g).getSouthBoundLatitude()));
+ xhtmlContentHandler.endElement("td");
+ xhtmlContentHandler.endElement("tr");
+ }
+ }
+ }
+ }
+ xhtmlContentHandler.newline();
+ xhtmlContentHandler.endDocument();
+ }
+
+ private void getMetaDataCharacterSet(Metadata metadata, DefaultMetadata defaultMetaData){
+ CheckedHashSet<Charset> charSetList= (CheckedHashSet<Charset>) defaultMetaData.getCharacterSets();
+ for(Charset c:charSetList){
+ metadata.add("CharacterSet",c.name());
+ }
+ }
+
+
+ private void getMetaDataContact(Metadata metadata, DefaultMetadata defaultMetaData){
+ CheckedArrayList<ResponsibleParty> contactSet= (CheckedArrayList<ResponsibleParty>) defaultMetaData.getContacts();
+ for(ResponsibleParty rparty:contactSet){
+ if(rparty.getRole()!=null)
+ metadata.add("ContactRole",rparty.getRole().name());
+ if(rparty.getOrganisationName()!=null)
+ metadata.add("ContactPartyName-",rparty.getOrganisationName().toString());
+ }
+ }
+
+ private void getMetaDataIdentificationInfo(Metadata metadata, DefaultMetadata defaultMetaData){
+ ArrayList<Identification> identifications= (ArrayList<Identification>) defaultMetaData.getIdentificationInfo();
+ for(Identification i:identifications){
+ DefaultDataIdentification defaultDataIdentification= (DefaultDataIdentification) i;
+ if(i.getCitation()!=null && i.getCitation().getTitle()!=null)
+ metadata.add("IdentificationInfoCitationTitle ",i.getCitation().getTitle().toString());
+
+ ArrayList<CitationDate> dateArrayList= (ArrayList<CitationDate>) i.getCitation().getDates();
+ for (CitationDate d:dateArrayList){
+ if(d.getDateType()!=null)
+ metadata.add("CitationDate ",d.getDateType().name()+"-->"+d.getDate());
+ }
+ ArrayList<ResponsibleParty> responsiblePartyArrayList= (ArrayList<ResponsibleParty>) i.getCitation().getCitedResponsibleParties();
+ for(ResponsibleParty r:responsiblePartyArrayList){
+ if(r.getRole()!=null)
+ metadata.add("CitedResponsiblePartyRole ",r.getRole().toString());
+ if(r.getIndividualName()!=null)
+ metadata.add("CitedResponsiblePartyName ",r.getIndividualName().toString());
+ if(r.getOrganisationName()!=null)
+ metadata.add("CitedResponsiblePartyOrganizationName ", r.getOrganisationName().toString());
+ if(r.getPositionName()!=null)
+ metadata.add("CitedResponsiblePartyPositionName ",r.getPositionName().toString());
+
+ if(r.getContactInfo()!=null && r.getContactInfo().getAddress()!=null){
+ for(String s:r.getContactInfo().getAddress().getElectronicMailAddresses()) {
+ metadata.add("CitedResponsiblePartyEMail ",s);
+ }
+ }
+ }
+ if(i.getAbstract()!=null)
+ metadata.add("IdentificationInfoAbstract ",i.getAbstract().toString());
+ for(Progress p:i.getStatus()) {
+ metadata.add("IdentificationInfoStatus ",p.name());
+ }
+ ArrayList<Format> formatArrayList= (ArrayList<Format>) i.getResourceFormats();
+ for(Format f:formatArrayList){
+ if(f.getName()!=null)
+ metadata.add("ResourceFormatSpecificationAlternativeTitle ",f.getName().toString());
+ }
+ CheckedHashSet<Locale> localeCheckedHashSet= (CheckedHashSet<Locale>) defaultDataIdentification.getLanguages();
+ for(Locale l:localeCheckedHashSet){
+ metadata.add("IdentificationInfoLanguage-->",l.getDisplayLanguage(Locale.ENGLISH));
+ }
+ CodeListSet<TopicCategory> categoryList= (CodeListSet<TopicCategory>) defaultDataIdentification.getTopicCategories();
+ for(TopicCategory t:categoryList){
+ metadata.add("IdentificationInfoTopicCategory-->",t.name());
+ }
+ ArrayList<Keywords> keywordList= (ArrayList<Keywords>) i.getDescriptiveKeywords();
+ int j=0;
+ for(Keywords k:keywordList){
+ j++;
+ ArrayList<InternationalString> stringList= (ArrayList<InternationalString>) k.getKeywords();
+ for(InternationalString s:stringList){
+ metadata.add("Keywords "+j ,s.toString());
+ }
+ if(k.getType()!=null)
+ metadata.add("KeywordsType "+j,k.getType().name());
+ if(k.getThesaurusName()!=null && k.getThesaurusName().getTitle()!=null)
+ metadata.add("ThesaurusNameTitle "+j,k.getThesaurusName().getTitle().toString());
+ if(k.getThesaurusName()!=null && k.getThesaurusName().getAlternateTitles()!=null)
+ metadata.add("ThesaurusNameAlternativeTitle "+j,k.getThesaurusName().getAlternateTitles().toString());
+
+ ArrayList<CitationDate>citationDates= (ArrayList<CitationDate>) k.getThesaurusName().getDates();
+ for(CitationDate cd:citationDates) {
+ if(cd.getDateType()!=null)
+ metadata.add("ThesaurusNameDate ",cd.getDateType().name() +"-->" + cd.getDate());
+ }
+ }
+ ArrayList<DefaultLegalConstraints> constraintList= (ArrayList<DefaultLegalConstraints>) i.getResourceConstraints();
+
+ for(DefaultLegalConstraints c:constraintList){
+ for(Restriction r:c.getAccessConstraints()){
+ metadata.add("AccessContraints ",r.name());
+ }
+ for(InternationalString s:c.getOtherConstraints()){
+ metadata.add("OtherConstraints ",s.toString());
+ }
+ for(Restriction r:c.getUseConstraints()) {
+ metadata.add("UserConstraints ",r.name());
+ }
+
+ }
+ Collection<Extent> extentList=((DefaultDataIdentification) i).getExtents();
+ for(Extent e:extentList){
+ ArrayList<GeographicExtent> geoElements= (ArrayList<GeographicExtent>) e.getGeographicElements();
+ for(GeographicExtent g:geoElements){
+
+ if(g instanceof DefaultGeographicDescription){
+ Identifier geographicIdentifier=((DefaultGeographicDescription) g).getGeographicIdentifier();
+ if(geographicIdentifier!=null){
+ if(geographicIdentifier.getCode()!=null)
+ metadata.add("GeographicIdentifierCode ",geographicIdentifier.getCode().toString());
+ Citation authority=geographicIdentifier.getAuthority();
+ if(authority!=null){
+ if(authority.getTitle()!=null)
+ metadata.add("GeographicIdentifierAuthorityTitle ",authority.getTitle().toString());
+
+ for(InternationalString s:authority.getAlternateTitles()) {
+ metadata.add("GeographicIdentifierAuthorityAlternativeTitle ",s.toString());
+ }
+ for(CitationDate cd:authority.getDates()){
+ if(cd.getDateType()!=null && cd.getDate()!=null)
+ metadata.add("GeographicIdentifierAuthorityDate ",cd.getDateType().name()+" "+cd.getDate().toString());
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private void getMetaDataDistributionInfo(Metadata metadata, DefaultMetadata defaultMetaData){
+ Distribution distribution=defaultMetaData.getDistributionInfo();
+ if(distribution==null)
+ return;
+ ArrayList<Format> distributionFormat= (ArrayList<Format>) distribution.getDistributionFormats();
+ for(Format f:distributionFormat){
+ if(f.getName()!=null)
+ metadata.add("DistributionFormatSpecificationAlternativeTitle ",f.getName().toString());
+ }
+ ArrayList<Distributor> distributorList= (ArrayList<Distributor>) distribution.getDistributors();
+ for(Distributor d:distributorList){
+ if(d!=null && d.getDistributorContact()!=null && d.getDistributorContact().getRole()!=null)
+ metadata.add("Distributor Contact ",d.getDistributorContact().getRole().name());
+ if(d!=null && d.getDistributorContact()!=null && d.getDistributorContact().getOrganisationName()!=null)
+ metadata.add("Distributor Organization Name ",d.getDistributorContact().getOrganisationName().toString());
+ }
+ ArrayList<DigitalTransferOptions> transferOptionsList= (ArrayList<DigitalTransferOptions>) distribution.getTransferOptions();
+ for(DigitalTransferOptions d:transferOptionsList){
+ ArrayList<OnlineResource> onlineResourceList= (ArrayList<OnlineResource>) d.getOnLines();
+ for(OnlineResource or:onlineResourceList){
+ if(or.getLinkage()!=null)
+ metadata.add("TransferOptionsOnlineLinkage ",or.getLinkage().toString());
+ if(or.getProtocol()!=null)
+ metadata.add("TransferOptionsOnlineProtocol ",or.getProtocol());
+ if(or.getApplicationProfile()!=null)
+ metadata.add("TransferOptionsOnlineProfile ",or.getApplicationProfile());
+ if(or.getName()!=null)
+ metadata.add("TransferOptionsOnlineName ",or.getName());
+ if(or.getDescription()!=null)
+ metadata.add("TransferOptionsOnlineDescription ",or.getDescription().toString());
+ if(or.getFunction()!=null)
+ metadata.add("TransferOptionsOnlineFunction ",or.getFunction().name());
+
+ }
+ }
+ }
+
+ private void getMetaDataDateInfo(Metadata metadata, DefaultMetadata defaultMetaData){
+ ArrayList<CitationDate> citationDateList= (ArrayList<CitationDate>) defaultMetaData.getDateInfo();
+ for(CitationDate c:citationDateList){
+ if(c.getDateType()!=null)
+ metadata.add("DateInfo ",c.getDateType().name()+" "+c.getDate());
+ }
+ }
+
+ private void getMetaDataResourceScope(Metadata metadata, DefaultMetadata defaultMetaData){
+ ArrayList<DefaultMetadataScope> scopeList= (ArrayList<DefaultMetadataScope>) defaultMetaData.getMetadataScopes();
+ for(DefaultMetadataScope d:scopeList){
+ if(d.getResourceScope()!=null)
+ metadata.add("MetaDataResourceScope ",d.getResourceScope().name());
+ }
+ }
+
+ private void getMetaDataParentMetaDataTitle(Metadata metadata, DefaultMetadata defaultMetaData){
+ Citation parentMetaData=defaultMetaData.getParentMetadata();
+ if(parentMetaData!=null && parentMetaData.getTitle()!=null)
+ metadata.add("ParentMetaDataTitle",parentMetaData.getTitle().toString());
+ }
+
+ private void getMetaDataIdentifierCode(Metadata metadata, DefaultMetadata defaultMetaData){
+ Identifier identifier= defaultMetaData.getMetadataIdentifier();
+ if(identifier!=null)
+ metadata.add("MetaDataIdentifierCode",identifier.getCode());
+ }
+
+ private void getMetaDataStandard(Metadata metadata, DefaultMetadata defaultMetaData){
+ ArrayList<Citation> citationList= (ArrayList<Citation>) defaultMetaData.getMetadataStandards();
+ for(Citation c:citationList){
+ if(c.getTitle()!=null)
+ metadata.add("MetaDataStandardTitle ",c.getTitle().toString());
+ if(c.getEdition()!=null)
+ metadata.add("MetaDataStandardEdition ",c.getEdition().toString());
+ }
+ }
+}
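[Editor's note] For orientation, a minimal usage sketch for the parser above; it is not part of this commit, and the file name iso19139.xml is only an assumed sample ISO 19139 document:

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.geoinfo.GeographicInformationParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class GeoInfoParserExample {
        public static void main(String[] args) throws Exception {
            GeographicInformationParser parser = new GeographicInformationParser();
            Metadata metadata = new Metadata();
            // "iso19139.xml" is an assumed sample file holding ISO 19139 geographic metadata.
            try (InputStream stream = Files.newInputStream(Paths.get("iso19139.xml"))) {
                parser.parse(stream, new BodyContentHandler(), metadata, new ParseContext());
            }
            // Prints keys such as "CharacterSet" and the identification/distribution keys set above.
            for (String name : metadata.names()) {
                System.out.println(name + " = " + metadata.get(name));
            }
        }
    }
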
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/grib/GribParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/grib/GribParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/grib/GribParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/grib/GribParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.grib;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.File;
+import java.util.Collections;
+import java.util.Set;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import ucar.nc2.Attribute;
+import ucar.nc2.Dimension;
+import ucar.nc2.NetcdfFile;
+import ucar.nc2.Variable;
+import ucar.nc2.dataset.NetcdfDataset;
+
+public class GribParser extends AbstractParser {
+
+ private static final long serialVersionUID = 7855458954474247655L;
+
+ public static final String GRIB_MIME_TYPE = "application/x-grib2";
+
+ private final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("x-grib2"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ //Set MIME type as grib2
+ metadata.set(Metadata.CONTENT_TYPE, GRIB_MIME_TYPE);
+
+ TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources());
+ File gribFile = tis.getFile();
+
+ try {
+ NetcdfFile ncFile = NetcdfDataset.openFile(gribFile.getAbsolutePath(), null);
+
+ // first parse out the set of global attributes
+ for (Attribute attr : ncFile.getGlobalAttributes()) {
+ Property property = resolveMetadataKey(attr.getFullName());
+ if (attr.getDataType().isString()) {
+ metadata.add(property, attr.getStringValue());
+ } else if (attr.getDataType().isNumeric()) {
+ int value = attr.getNumericValue().intValue();
+ metadata.add(property, String.valueOf(value));
+ }
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+ xhtml.startDocument();
+
+ xhtml.newline();
+ xhtml.startElement("ul");
+ xhtml.characters("dimensions:");
+ xhtml.newline();
+
+ for (Dimension dim : ncFile.getDimensions()){
+ xhtml.element("li", dim.getFullName() + "=" + String.valueOf(dim.getLength()) + ";");
+ xhtml.newline();
+ }
+
+ xhtml.startElement("ul");
+ xhtml.characters("variables:");
+ xhtml.newline();
+
+ for (Variable var : ncFile.getVariables()){
+ xhtml.element("p", String.valueOf(var.getDataType()) + var.getNameAndDimensions() + ";");
+ for(Attribute element : var.getAttributes()){
+ xhtml.element("li", " :" + element + ";");
+ xhtml.newline();
+ }
+ }
+ xhtml.endElement("ul");
+ xhtml.endElement("ul");
+ xhtml.endDocument();
+
+ } catch (IOException e) {
+ throw new TikaException("NetCDF parse error", e);
+ }
+ }
+
+ private Property resolveMetadataKey(String localName) {
+ if ("title".equals(localName)) {
+ return TikaCoreProperties.TITLE;
+ }
+ return Property.internalText(localName);
+ }
+
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/hdf/HDFParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.hdf;
+
+//JDK imports
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.netcdf.NetCDFParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import ucar.nc2.Attribute;
+import ucar.nc2.Group;
+import ucar.nc2.NetcdfFile;
+
+/**
+ *
+ * Since the {@link NetCDFParser} depends on the <a
+ * href="http://www.unidata.ucar.edu/software/netcdf-java" >NetCDF-Java</a> API,
+ * we are able to use it to parse HDF files as well. See <a href=
+ * "http://www.unidata.ucar.edu/software/netcdf-java/formats/FileTypes.html"
+ * >this link</a> for more information.
+ */
+public class HDFParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = 1091208208003437549L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("x-hdf"));
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ * org.apache.tika.parser.netcdf.NetCDFParser#getSupportedTypes(org.apache
+ * .tika.parser.ParseContext)
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ * org.apache.tika.parser.netcdf.NetCDFParser#parse(java.io.InputStream,
+ * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+ * org.apache.tika.parser.ParseContext)
+ */
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+ ByteArrayOutputStream os = new ByteArrayOutputStream();
+ IOUtils.copy(stream, os);
+
+ String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+ if (name == null) {
+ name = "";
+ }
+ try {
+ NetcdfFile ncFile = NetcdfFile.openInMemory(name, os.toByteArray());
+ unravelStringMet(ncFile, null, metadata);
+ } catch (IOException e) {
+ throw new TikaException("HDF parse error", e);
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.endDocument();
+ }
+
+ protected void unravelStringMet(NetcdfFile ncFile, Group group, Metadata met) {
+ if (group == null) {
+ group = ncFile.getRootGroup();
+ }
+
+ // get file type
+ met.set("File-Type-Description", ncFile.getFileTypeDescription());
+ // unravel its string attrs
+ for (Attribute attribute : group.getAttributes()) {
+ if (attribute.isString()) {
+ met.add(attribute.getFullName(), attribute.getStringValue());
+ } else {
+ // try and cast its value to a string
+ met.add(attribute.getFullName(), String.valueOf(attribute
+ .getNumericValue()));
+ }
+ }
+
+ for (Group g : group.getGroups()) {
+ unravelStringMet(ncFile, g, met);
+ }
+ }
+
+}
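[Editor's note] As the class comment above says, the parser hands the bytes to NetCDF-Java, so driving it looks the same as for any other Tika parser. A minimal sketch, assuming an HDF file named sample.h5 (not shipped by this commit):

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.hdf.HDFParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class HdfParserExample {
        public static void main(String[] args) throws Exception {
            HDFParser parser = new HDFParser();
            Metadata metadata = new Metadata();
            // "sample.h5" stands in for any HDF file that NetCDF-Java can open in memory.
            try (InputStream stream = Files.newInputStream(Paths.get("sample.h5"))) {
                parser.parse(stream, new BodyContentHandler(), metadata, new ParseContext());
            }
            // unravelStringMet() above copies every group attribute into the Metadata object.
            System.out.println(metadata.get("File-Type-Description"));
        }
    }
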
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISATabUtils.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,209 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.isatab;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.SAXException;
+
+public class ISATabUtils {
+
+ private static final ServiceLoader LOADER = new ServiceLoader(ISATabUtils.class.getClassLoader());
+
+ /**
+ * INVESTIGATION
+ */
+
+ // Investigation section.
+ private static final String[] sections = {
+ "ONTOLOGY SOURCE REFERENCE",
+ "INVESTIGATION",
+ "INVESTIGATION PUBLICATIONS",
+ "INVESTIGATION CONTACTS"
+ };
+
+ // STUDY section (within the investigation file)
+ private static final String studySectionField = "STUDY";
+
+ // Study File Name (inside the STUDY section)
+ private static final String studyFileNameField = "Study File Name";
+
+ public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata, ParseContext context, String studyFileName) throws IOException, TikaException, SAXException {
+ // Automatically detect the character encoding
+ try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(stream),
+ metadata, context.get(ServiceLoader.class, LOADER))) {
+ extractMetadata(reader, metadata, studyFileName);
+ }
+ }
+
+ public static void parseInvestigation(InputStream stream, XHTMLContentHandler handler, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+ parseInvestigation(stream, handler, metadata, context, null);
+ }
+
+ public static void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+ TikaInputStream tis = TikaInputStream.get(stream);
+ // Automatically detect the character encoding
+
+ try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis),
+ metadata, context.get(ServiceLoader.class, LOADER));
+ CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
+ Iterator<CSVRecord> iterator = csvParser.iterator();
+
+ xhtml.startElement("table");
+
+ xhtml.startElement("thead");
+ if (iterator.hasNext()) {
+ CSVRecord record = iterator.next();
+ for (int i = 0; i < record.size(); i++) {
+ xhtml.startElement("th");
+ xhtml.characters(record.get(i));
+ xhtml.endElement("th");
+ }
+ }
+ xhtml.endElement("thead");
+
+ xhtml.startElement("tbody");
+ while (iterator.hasNext()) {
+ CSVRecord record = iterator.next();
+ xhtml.startElement("tr");
+ for (int j = 0; j < record.size(); j++) {
+ xhtml.startElement("td");
+ xhtml.characters(record.get(j));
+ xhtml.endElement("td");
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("tbody");
+
+ xhtml.endElement("table");
+ }
+ }
+
+ public static void parseAssay(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException {
+ TikaInputStream tis = TikaInputStream.get(stream);
+
+ // Automatically detect the character encoding
+
+ try (AutoDetectReader reader = new AutoDetectReader(new CloseShieldInputStream(tis),
+ metadata, context.get(ServiceLoader.class, LOADER));
+ CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
+ xhtml.startElement("table");
+
+ Iterator<CSVRecord> iterator = csvParser.iterator();
+
+ xhtml.startElement("thead");
+ if (iterator.hasNext()) {
+ CSVRecord record = iterator.next();
+ for (int i = 0; i < record.size(); i++) {
+ xhtml.startElement("th");
+ xhtml.characters(record.get(i));
+ xhtml.endElement("th");
+ }
+ }
+ xhtml.endElement("thead");
+
+ xhtml.startElement("tbody");
+ while (iterator.hasNext()) {
+ CSVRecord record = iterator.next();
+ xhtml.startElement("tr");
+ for (int j = 0; j < record.size(); j++) {
+ xhtml.startElement("td");
+ xhtml.characters(record.get(j));
+ xhtml.endElement("td");
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("tbody");
+
+ xhtml.endElement("table");
+ }
+ }
+
+ private static void extractMetadata(Reader reader, Metadata metadata, String studyFileName) throws IOException {
+ boolean investigationSection = false;
+ boolean studySection = false;
+ boolean studyTarget = false;
+
+ Map<String, String> map = new HashMap<String, String>();
+
+ try (CSVParser csvParser = new CSVParser(reader, CSVFormat.TDF)) {
+ Iterator<CSVRecord> iterator = csvParser.iterator();
+
+ while (iterator.hasNext()) {
+ CSVRecord record = iterator.next();
+ String field = record.get(0);
+ if ((field.toUpperCase(Locale.ENGLISH).equals(field)) && (record.size() == 1)) {
+ investigationSection = Arrays.asList(sections).contains(field);
+ studySection = (studyFileName != null) && (field.equals(studySectionField));
+ } else {
+ if (investigationSection) {
+ addMetadata(field, record, metadata);
+ } else if (studySection) {
+ if (studyTarget) {
+ break;
+ }
+ String value = record.get(1);
+ map.put(field, value);
+ studyTarget = (field.equals(studyFileNameField)) && (value.equals(studyFileName));
+ if (studyTarget) {
+ mapStudyToMetadata(map, metadata);
+ studySection = false;
+ }
+ } else if (studyTarget) {
+ addMetadata(field, record, metadata);
+ }
+ }
+ }
+ }
+ }
+
+ private static void addMetadata(String field, CSVRecord record, Metadata metadata) {
+ if ((record ==null) || (record.size() <= 1)) {
+ return;
+ }
+
+ for (int i = 1; i < record.size(); i++) {
+ metadata.add(field, record.get(i));
+ }
+ }
+
+ private static void mapStudyToMetadata(Map<String, String> map, Metadata metadata) {
+ for (Map.Entry<String, String> entry : map.entrySet()) {
+ metadata.add(entry.getKey(), entry.getValue());
+ }
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/isatab/ISArchiveParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.isatab;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class ISArchiveParser implements Parser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = 3640809327541300229L;
+
+ private final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-isatab"));
+
+ private static String studyAssayFileNameField = "Study Assay File Name";
+
+ private String location = null;
+
+ private String studyFileName = null;
+
+ /**
+ * Default constructor.
+ */
+ public ISArchiveParser() {
+ this(null);
+ }
+
+ /**
+ * Constructor that accepts the pathname of the ISA archive folder.
+ * @param location pathname of the ISA archive folder containing the ISA-Tab files
+ */
+ public ISArchiveParser(String location) {
+ if (location != null && !location.endsWith(File.separator)) {
+ location += File.separator;
+ }
+ this.location = location;
+ }
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+ ParseContext context) throws IOException, SAXException, TikaException {
+
+ TikaInputStream tis = TikaInputStream.get(stream);
+ if (this.location == null) {
+ this.location = tis.getFile().getParent() + File.separator;
+ }
+ this.studyFileName = tis.getFile().getName();
+
+ File locationFile = new File(location);
+ String[] investigationList = locationFile.list(new FilenameFilter() {
+
+ @Override
+ public boolean accept(File dir, String name) {
+ return name.matches("i_.+\\.txt");
+ }
+ });
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ parseInvestigation(investigationList, xhtml, metadata, context);
+ // Use the file-backed TikaInputStream so the study can still be read after getFile() above.
+ parseStudy(tis, xhtml, metadata, context);
+ parseAssay(xhtml, metadata, context);
+
+ xhtml.endDocument();
+ }
+
+ private void parseInvestigation(String[] investigationList, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ if ((investigationList == null) || (investigationList.length == 0)) {
+ // TODO warning
+ return;
+ }
+ if (investigationList.length > 1) {
+ // TODO warning
+ return;
+ }
+
+ String investigation = investigationList[0]; // TODO add to metadata?
+ InputStream stream = TikaInputStream.get(new File(this.location + investigation));
+
+ ISATabUtils.parseInvestigation(stream, xhtml, metadata, context, this.studyFileName);
+
+ xhtml.element("h1", "INVESTIGATION " + metadata.get("Investigation Identifier"));
+ }
+
+ private void parseStudy(InputStream stream, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ xhtml.element("h2", "STUDY " + metadata.get("Study Identifier"));
+
+ ISATabUtils.parseStudy(stream, xhtml, metadata, context);
+ }
+
+ private void parseAssay(XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+ for (String assayFileName : metadata.getValues(studyAssayFileNameField)) {
+ xhtml.startElement("div");
+ xhtml.element("h3", "ASSAY " + assayFileName);
+ InputStream stream = TikaInputStream.get(new File(this.location + assayFileName));
+ ISATabUtils.parseAssay(stream, xhtml, metadata, context);
+ xhtml.endElement("div");
+ }
+ }
+}
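[Editor's note] Because the parser resolves the investigation and assay files relative to the study file's folder, callers normally hand it a file-backed stream and, optionally, the folder itself. A minimal sketch; the BII-I-1 paths and file names are illustrative assumptions only:

    import java.io.File;
    import java.io.InputStream;

    import org.apache.tika.io.TikaInputStream;
    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.isatab.ISArchiveParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class ISArchiveParserExample {
        public static void main(String[] args) throws Exception {
            // Assumed layout: /data/BII-I-1/ holds i_Investigation.txt, the study file below,
            // and the assay files that study references. All names here are made up.
            File study = new File("/data/BII-I-1/s_BII-S-1.txt");
            ISArchiveParser parser = new ISArchiveParser(study.getParent());
            Metadata metadata = new Metadata();
            try (InputStream stream = TikaInputStream.get(study)) {
                parser.parse(stream, new BodyContentHandler(), metadata, new ParseContext());
            }
            System.out.println(metadata.get("Investigation Identifier"));
        }
    }
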
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/mat/MatParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/mat/MatParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/mat/MatParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/mat/MatParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.mat;
+
+//JDK imports
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import java.util.Map;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+//JMatIO imports
+import com.jmatio.io.MatFileHeader;
+import com.jmatio.io.MatFileReader;
+import com.jmatio.types.MLArray;
+import com.jmatio.types.MLStructure;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+
+public class MatParser extends AbstractParser {
+
+ public static final String MATLAB_MIME_TYPE =
+ "application/x-matlab-data";
+
+ private final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("x-matlab-data"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context){
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+
+ //Set MIME type as Matlab
+ metadata.set(Metadata.CONTENT_TYPE, MATLAB_MIME_TYPE);
+
+ try {
+ // Use TIS so we can spool a temp file for parsing.
+ TikaInputStream tis = TikaInputStream.get(stream);
+
+ //Extract information from header file
+ MatFileReader mfr = new MatFileReader(tis.getFile()); //input .mat file
+ MatFileHeader hdr = mfr.getMatFileHeader(); //.mat header information
+
+ // Example header: "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014"
+ String[] parts = hdr.getDescription().split(","); // Break header information into its parts
+
+ if (parts[2].contains("Created")) {
+ int lastIndex1 = parts[2].lastIndexOf("Created on:");
+ String dateCreated = parts[2].substring(lastIndex1 + "Created on:".length()).trim();
+ metadata.set("createdOn", dateCreated);
+ }
+
+ if (parts[1].contains("Platform")) {
+ int lastIndex2 = parts[1].lastIndexOf("Platform:");
+ String platform = parts[1].substring(lastIndex2 + "Platform:".length()).trim();
+ metadata.set("platform" , platform);
+ }
+
+ if (parts[0].contains("MATLAB")) {
+ metadata.set("fileType", parts[0]);
+ }
+
+ // Get endian indicator from header file
+ String endianBytes = new String(hdr.getEndianIndicator(), UTF_8); // Retrieve endian bytes and convert to string
+ String endianCode = String.valueOf(endianBytes.toCharArray()); // Convert bytes to characters to string
+ metadata.set("endian", endianCode);
+
+ //Text output
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.newline();
+ //Loop through each variable
+ for (Map.Entry<String, MLArray> entry : mfr.getContent().entrySet()) {
+ String varName = entry.getKey();
+ MLArray varData = entry.getValue();
+
+ xhtml.element("p", varName + ":" + String.valueOf(varData));
+
+ // If the variable is a structure, extract variable info from structure
+ if (varData.isStruct()){
+ MLStructure mlStructure = (MLStructure) mfr.getMLArray(varName);
+ xhtml.startElement("ul");
+ xhtml.newline();
+ for (MLArray element : mlStructure.getAllFields()){
+ xhtml.startElement("li");
+ xhtml.characters(String.valueOf(element));
+
+ // If there is an embedded structure, extract variable info.
+ if (element.isStruct()){
+ xhtml.startElement("ul");
+ // Should this actually be a recursive call?
+ xhtml.element("li", element.contentToString());
+ xhtml.endElement("ul");
+ }
+
+ xhtml.endElement("li");
+ }
+ xhtml.endElement("ul");
+ }
+ }
+ xhtml.endDocument();
+ } catch (IOException e) {
+ throw new TikaException("Error parsing Matlab file with MatParser", e);
+ }
+ }
+}
\ No newline at end of file
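[Editor's note] The header handling above hinges on the comma-separated layout of the MAT-file description string. A tiny stand-alone sketch of that split, using the example header quoted in the code:

    public class MatHeaderSplitExample {
        public static void main(String[] args) {
            // Example header quoted in MatParser; splitting on "," yields three parts.
            String description =
                    "MATLAB 5.0 MAT-file, Platform: MACI64, Created on: Sun Mar 2 23:41:57 2014";
            String[] parts = description.split(",");
            // parts[0] -> fileType, parts[1] -> platform, parts[2] -> createdOn
            String platform = parts[1].substring(
                    parts[1].lastIndexOf("Platform:") + "Platform:".length()).trim();
            String createdOn = parts[2].substring(
                    parts[2].lastIndexOf("Created on:") + "Created on:".length()).trim();
            System.out.println(parts[0] + " | " + platform + " | " + createdOn);
        }
    }
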
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/netcdf/NetCDFParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.netcdf;
+
+//JDK imports
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+import java.util.List;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import ucar.nc2.Attribute;
+import ucar.nc2.NetcdfFile;
+import ucar.nc2.Variable;
+import ucar.nc2.Dimension;
+
+/**
+ * A {@link Parser} for <a
+ * href="http://www.unidata.ucar.edu/software/netcdf/index.html">NetCDF</a>
+ * files using the UCAR, MIT-licensed <a
+ * href="http://www.unidata.ucar.edu/software/netcdf-java/">NetCDF for Java</a>
+ * API.
+ */
+public class NetCDFParser extends AbstractParser {
+
+ /**
+ * Serial version UID
+ */
+ private static final long serialVersionUID = -5940938274907708665L;
+
+ private final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("x-netcdf"));
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see
+ * org.apache.tika.parser.Parser#getSupportedTypes(org.apache.tika.parser
+ * .ParseContext)
+ */
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.tika.parser.Parser#parse(java.io.InputStream,
+ * org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata,
+ * org.apache.tika.parser.ParseContext)
+ */
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ TikaInputStream tis = TikaInputStream.get(stream, new TemporaryResources());
+ try {
+ NetcdfFile ncFile = NetcdfFile.open(tis.getFile().getAbsolutePath());
+ metadata.set("File-Type-Description", ncFile.getFileTypeDescription());
+ // first parse out the set of global attributes
+ for (Attribute attr : ncFile.getGlobalAttributes()) {
+ Property property = resolveMetadataKey(attr.getFullName());
+ if (attr.getDataType().isString()) {
+ metadata.add(property, attr.getStringValue());
+ } else if (attr.getDataType().isNumeric()) {
+ int value = attr.getNumericValue().intValue();
+ metadata.add(property, String.valueOf(value));
+ }
+ }
+
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+ xhtml.newline();
+ xhtml.element("h1", "dimensions");
+ xhtml.startElement("ul");
+ xhtml.newline();
+ for (Dimension dim : ncFile.getDimensions()) {
+ xhtml.element("li", dim.getFullName() + " = " + dim.getLength());
+ }
+ xhtml.endElement("ul");
+
+ xhtml.element("h1", "variables");
+ xhtml.startElement("ul");
+ xhtml.newline();
+ for (Variable var : ncFile.getVariables()) {
+ xhtml.startElement("li");
+ xhtml.characters(var.getDataType() + " " + var.getNameAndDimensions());
+ xhtml.newline();
+ List<Attribute> attributes = var.getAttributes();
+ if (!attributes.isEmpty()) {
+ xhtml.startElement("ul");
+ for (Attribute element : attributes) {
+ xhtml.element("li", element.toString());
+ }
+ xhtml.endElement("ul");
+ }
+ xhtml.endElement("li");
+ }
+ xhtml.endElement("ul");
+
+ xhtml.endDocument();
+
+ } catch (IOException e) {
+ throw new TikaException("NetCDF parse error", e);
+ }
+ }
+
+ private Property resolveMetadataKey(String localName) {
+ if ("title".equals(localName)) {
+ return TikaCoreProperties.TITLE;
+ }
+ return Property.internalText(localName);
+ }
+}
\ No newline at end of file
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/java/org/apache/tika/parser/pot/PooledTimeSeriesParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.pot;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TemporaryResources;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.BufferedReader;
+import java.util.logging.Logger;
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.exec.environment.EnvironmentUtils;
+import org.xml.sax.helpers.AttributesImpl;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
+import static java.nio.charset.StandardCharsets.UTF_8;
+
+public class PooledTimeSeriesParser extends AbstractParser {
+
+ private static final long serialVersionUID = -2855917932512164988L;
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections
+ .unmodifiableSet(new HashSet<MediaType>(Arrays.asList(new MediaType[] {
+ MediaType.video("avi"), MediaType.video("mp4")
+ // TODO: Add all supported video types
+ })));
+
+ private static final Logger LOG = Logger.getLogger(PooledTimeSeriesParser.class.getName());
+
+ public boolean isAvailable() {
+ return ExternalParser.check(
+ new String[] { "pooled-time-series", "--help" }, -1);
+ }
+
+ /**
+ * Returns the set of media types supported by this parser when used with the
+ * given parse context.
+ *
+ * @param context
+ * parse context
+ * @return immutable set of media types
+ * @since Apache Tika 0.7
+ */
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ /**
+ * Parses a document stream into a sequence of XHTML SAX events. Fills in
+ * related document metadata in the given metadata object.
+ * <p>
+ * The given document stream is consumed but not closed by this method. The
+ * responsibility to close the stream remains on the caller.
+ * <p>
+ * Information about the parsing context can be passed in the context
+ * parameter. See the parser implementations for the kinds of context
+ * information they expect.
+ *
+ * @param stream
+ * the document stream (input)
+ * @param handler
+ * handler for the XHTML SAX events (output)
+ * @param metadata
+ * document metadata (input and output)
+ * @param context
+ * parse context
+ * @throws IOException
+ * if the document stream could not be read
+ * @throws SAXException
+ * if the SAX events could not be processed
+ * @throws TikaException
+ * if the document could not be parsed
+ * @since Apache Tika 0.5
+ */
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ if (!isAvailable()) {
+ LOG.warning("pooled-time-series is not installed or not on the PATH; skipping parse");
+ return;
+ }
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+
+ TemporaryResources tmp = new TemporaryResources();
+ try {
+ TikaInputStream tikaStream = TikaInputStream.get(stream, tmp);
+ File input = tikaStream.getFile();
+ computePoT(input);
+ // pooled-time-series writes its results next to the input video, as
+ // <input>.of.txt (optical flow) and <input>.hog.txt (oriented gradients).
+ // Share one reader per file so the header read and the table extraction
+ // see a consistent position, and close both when done.
+ try (BufferedReader ofReader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(input.getAbsolutePath() + ".of.txt"), UTF_8));
+ BufferedReader ogReader = new BufferedReader(new InputStreamReader(
+ new FileInputStream(input.getAbsolutePath() + ".hog.txt"), UTF_8))) {
+ extractHeaderOutput(ofReader, metadata, "of");
+ extractHeaderOutput(ogReader, metadata, "og");
+ xhtml.startDocument();
+ doExtract(ofReader, xhtml, "Histogram of Optical Flows (HOF)",
+ metadata.get("of_frames"), metadata.get("of_vecSize"));
+ doExtract(ogReader, xhtml, "Histogram of Oriented Gradients (HOG)",
+ metadata.get("og_frames"), metadata.get("og_vecSize"));
+ xhtml.endDocument();
+ }
+ } finally {
+ tmp.dispose();
+ }
+ }
+
+ private String computePoT(File input)
+ throws IOException, TikaException {
+
+ CommandLine cmdLine = new CommandLine("pooled-time-series");
+ cmdLine.addArgument("-f");
+ cmdLine.addArgument(input.getAbsolutePath());
+ LOG.fine("Executing: " + cmdLine);
+ DefaultExecutor exec = new DefaultExecutor();
+ exec.setExitValue(0);
+ // Kill the external process if it has not finished within one minute.
+ exec.setWatchdog(new ExecuteWatchdog(60000));
+ ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+ exec.setStreamHandler(new PumpStreamHandler(outputStream));
+ exec.execute(cmdLine, EnvironmentUtils.getProcEnvironment());
+ return outputStream.toString(UTF_8.name());
+ }
+
+ /**
+ * Reads the remaining contents of the given reader and writes them to the
+ * given XHTML content handler as a table. The reader is left open; closing
+ * it is the caller's responsibility.
+ *
+ * @param reader
+ * reader positioned just past the header line of a pooled-time-series
+ * output file
+ * @param xhtml
+ * XHTML content handler
+ * @param tableTitle
+ * the name of the matrix/table to display
+ * @param frames
+ * number of frames read from the video
+ * @param vecSize
+ * size of the OF or HOG vector
+ * @throws SAXException
+ * if the XHTML SAX events could not be handled
+ * @throws IOException
+ * if an input error occurred
+ */
+ private void doExtract(BufferedReader reader, XHTMLContentHandler xhtml,
+ String tableTitle, String frames, String vecSize) throws SAXException,
+ IOException {
+ String line = null;
+ AttributesImpl attributes = new AttributesImpl();
+ attributes.addAttribute("", "", "rows", "CDATA", frames);
+ attributes.addAttribute("", "", "cols", "CDATA", vecSize);
+
+ xhtml.startElement("h3");
+ xhtml.characters(tableTitle);
+ xhtml.endElement("h3");
+ xhtml.startElement("table", attributes);
+ while ((line = reader.readLine()) != null) {
+ xhtml.startElement("tr");
+ for (String val : line.split(" ")) {
+ xhtml.startElement("td");
+ xhtml.characters(val);
+ xhtml.endElement("td");
+ }
+ xhtml.endElement("tr");
+ }
+ xhtml.endElement("table");
+ }
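+ // Example (illustrative values only): a two-frame file with three-element
+ // vectors is rendered roughly as
+ //   <h3>Histogram of Optical Flows (HOF)</h3>
+ //   <table rows="2" cols="3">
+ //     <tr><td>0.1</td><td>0.2</td><td>0.3</td></tr>
+ //     <tr><td>0.4</td><td>0.5</td><td>0.6</td></tr>
+ //   </table>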
+
+ private void extractHeaderOutput(BufferedReader reader, Metadata metadata,
+ String prefix) throws IOException {
+ // The first line of each output file is "<frames> <vecSize>".
+ String line = reader.readLine();
+ String[] firstLine = line.split(" ");
+ String frames = firstLine[0];
+ String vecSize = firstLine[1];
+
+ if (prefix == null) {
+ prefix = "";
+ }
+ metadata.add(prefix + "_frames", frames);
+ metadata.add(prefix + "_vecSize", vecSize);
+ }
+
+}
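
A minimal usage sketch for the new parser (illustrative, not part of the
commit: the sample.avi path, the PoTDemo class name, and the unlimited
write limit are assumptions; the pooled-time-series binary must be on the
PATH):

    import java.io.InputStream;
    import java.nio.file.Files;
    import java.nio.file.Paths;

    import org.apache.tika.metadata.Metadata;
    import org.apache.tika.parser.ParseContext;
    import org.apache.tika.parser.pot.PooledTimeSeriesParser;
    import org.apache.tika.sax.BodyContentHandler;

    public class PoTDemo {
      public static void main(String[] args) throws Exception {
        PooledTimeSeriesParser parser = new PooledTimeSeriesParser();
        Metadata metadata = new Metadata();
        // -1 lifts BodyContentHandler's default character write limit,
        // since the emitted histogram tables can be large.
        BodyContentHandler handler = new BodyContentHandler(-1);
        try (InputStream stream = Files.newInputStream(Paths.get("sample.avi"))) {
          parser.parse(stream, handler, metadata, new ParseContext());
        }
        // Frame count and vector size come from the .of.txt header line.
        System.out.println(metadata.get("of_frames") + " frames, vector size "
            + metadata.get("of_vecSize"));
      }
    }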
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser Sat Jan 16 18:23:01 2016
@@ -0,0 +1,28 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+#org.apache.tika.parser.ctakes.CTAKESParser
+org.apache.tika.parser.dif.DIFParser
+org.apache.tika.parser.gdal.GDALParser
+org.apache.tika.parser.geo.topic.GeoParser
+org.apache.tika.parser.geoinfo.GeographicInformationParser
+org.apache.tika.parser.grib.GribParser
+org.apache.tika.parser.hdf.HDFParser
+org.apache.tika.parser.isatab.ISArchiveParser
+org.apache.tika.parser.mat.MatParser
+org.apache.tika.parser.netcdf.NetCDFParser
+org.apache.tika.parser.pot.PooledTimeSeriesParser
+#org.apache.tika.parser.envi.EnviHeaderParser
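
For context, this file is a standard META-INF/services registry: Tika's
service loading reads it to compose the module's parsers, and the same
format is also understood by plain java.util.ServiceLoader. A quick sketch
to list what is registered on the classpath (the ListParsers class name is
illustrative):

    import java.util.ServiceLoader;

    import org.apache.tika.parser.Parser;

    public class ListParsers {
      public static void main(String[] args) {
        // Prints every Parser implementation registered via META-INF/services.
        for (Parser parser : ServiceLoader.load(Parser.class)) {
          System.out.println(parser.getClass().getName());
        }
      }
    }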
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/dif/DIFParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dif;
+
+import org.apache.tika.TikaTest;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+
+import java.io.InputStream;
+
+import static org.junit.Assert.assertEquals;
+
+public class DIFParserTest extends TikaTest {
+
+ @Test
+ public void testDifMetadata() throws Exception {
+ Parser parser = new DIFParser();
+ ContentHandler handler = new BodyContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = DIFParser.class.getResourceAsStream(
+ "/test-documents/Zamora2010.dif")) {
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+    assertEquals("00794186-48f9-11e3-9dcb-00c0f03d5b7c", metadata.get("DIF-Entry_ID"));
+    assertEquals("ACADIS IDN DIF", metadata.get("DIF-Metadata_Name"));
+
+ String content = handler.toString();
+ assertContains("Title: Zamora 2010 Using Sediment Geochemistry", content);
+ assertContains("Southernmost_Latitude : 78.833", content);
+ assertContains("Northernmost_Latitude : 79.016", content);
+ assertContains("Westernmost_Longitude : 11.64", content);
+ assertContains("Easternmost_Longitude : 13.34", content);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/envi/EnviHeaderParserTest.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.envi;
+
+import static org.apache.tika.TikaTest.assertContains;
+import static org.junit.Assert.assertNotNull;
+
+import java.io.InputStream;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.junit.Test;
+
+/**
+ * Test cases to exercise the {@link EnviHeaderParser}.
+ */
+public class EnviHeaderParserTest {
+ @Test
+ public void testParseGlobalMetadata() throws Exception {
+ if (System.getProperty("java.version").startsWith("1.5")) {
+ return;
+ }
+
+ Parser parser = new EnviHeaderParser();
+ ToXMLContentHandler handler = new ToXMLContentHandler();
+ Metadata metadata = new Metadata();
+
+ try (InputStream stream = EnviHeaderParser.class.getResourceAsStream(
+ "/test-documents/envi_test_header.hdr")) {
+ assertNotNull("Test ENVI file not found", stream);
+ parser.parse(stream, handler, metadata, new ParseContext());
+ }
+
+ // Check content of test file
+ String content = handler.toString();
+ assertContains("<body><p>ENVI</p>", content);
+ assertContains("<p>samples = 2400</p>", content);
+ assertContains("<p>lines = 2400</p>", content);
+ assertContains("<p>map info = {Sinusoidal, 1.5000, 1.5000, -10007091.3643, 5559289.2856, 4.6331271653e+02, 4.6331271653e+02, , units=Meters}</p>", content);
+ assertContains("content=\"application/envi.hdr\"", content);
+ assertContains("projection info = {16, 6371007.2, 0.000000, 0.0, 0.0, Sinusoidal, units=Meters}", content);
+ }
+}
Added: tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java
URL: http://svn.apache.org/viewvc/tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java?rev=1725014&view=auto
==============================================================================
--- tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java (added)
+++ tika/branches/2.x/tika-parser-modules/tika-scientific-parser-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java Sat Jan 16 18:23:01 2016
@@ -0,0 +1,181 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.gdal;
+
+//JDK imports
+
+import java.io.IOException;
+import java.io.InputStream;
+
+
+//Tika imports
+import org.apache.tika.TikaTest;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.sax.BodyContentHandler;
+
+//Junit imports
+import org.junit.Test;
+import org.xml.sax.SAXException;
+
+import static org.junit.Assert.fail;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assume.assumeTrue;
+
+/**
+ * Test harness for the GDAL parser.
+ */
+public class TestGDALParser extends TikaTest {
+
+ private boolean canRun() {
+ String[] checkCmd = {"gdalinfo"};
+ // If GDAL is not on the path, do not run the test.
+ return ExternalParser.check(checkCmd);
+ }
+
+ @Test
+ public void testParseBasicInfo() {
+ assumeTrue(canRun());
+ final String expectedDriver = "netCDF/Network Common Data Format";
+ final String expectedUpperRight = "512.0, 0.0";
+ final String expectedUpperLeft = "0.0, 0.0";
+ final String expectedLowerLeft = "0.0, 512.0";
+ final String expectedLowerRight = "512.0, 512.0";
+ final String expectedCoordinateSystem = "`'";
+ final String expectedSize = "512, 512";
+
+ GDALParser parser = new GDALParser();
+ InputStream stream = TestGDALParser.class
+ .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
+ Metadata met = new Metadata();
+ BodyContentHandler handler = new BodyContentHandler();
+ try {
+ parser.parse(stream, handler, met, new ParseContext());
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+
+ assertNotNull(met);
+ assertNotNull(met.get("Driver"));
+ assertEquals(expectedDriver, met.get("Driver"));
+ assumeTrue(met.get("Files") != null);
+ assertNotNull(met.get("Coordinate System"));
+ assertEquals(expectedCoordinateSystem, met.get("Coordinate System"));
+ assertNotNull(met.get("Size"));
+ assertEquals(expectedSize, met.get("Size"));
+ assertNotNull(met.get("Upper Right"));
+ assertEquals(expectedUpperRight, met.get("Upper Right"));
+ assertNotNull(met.get("Upper Left"));
+ assertEquals(expectedUpperLeft, met.get("Upper Left"));
+ assertNotNull(met.get("Upper Right"));
+ assertEquals(expectedLowerRight, met.get("Lower Right"));
+ assertNotNull(met.get("Upper Right"));
+ assertEquals(expectedLowerLeft, met.get("Lower Left"));
+
+ }
+
+ @Test
+ public void testParseMetadata() {
+ assumeTrue(canRun());
+ final String expectedNcInst = "NCAR (National Center for Atmospheric Research, Boulder, CO, USA)";
+ final String expectedModelNameEnglish = "NCAR CCSM";
+ final String expectedProgramId = "Source file unknown Version unknown Date unknown";
+ final String expectedProjectId = "IPCC Fourth Assessment";
+ final String expectedRealization = "1";
+ final String expectedTitle = "model output prepared for IPCC AR4";
+ final String expectedSub8Name = "\":ua";
+ final String expectedSub8Desc = "[1x17x128x256] eastward_wind (32-bit floating-point)";
+
+ GDALParser parser = new GDALParser();
+ InputStream stream = TestGDALParser.class
+ .getResourceAsStream("/test-documents/sresa1b_ncar_ccsm3_0_run1_200001.nc");
+ Metadata met = new Metadata();
+ BodyContentHandler handler = new BodyContentHandler();
+ try {
+ parser.parse(stream, handler, met, new ParseContext());
+ assertNotNull(met);
+ assertNotNull(met.get("NC_GLOBAL#institution"));
+ assertEquals(expectedNcInst, met.get("NC_GLOBAL#institution"));
+ assertNotNull(met.get("NC_GLOBAL#model_name_english"));
+ assertEquals(expectedModelNameEnglish,
+ met.get("NC_GLOBAL#model_name_english"));
+ assertNotNull(met.get("NC_GLOBAL#prg_ID"));
+ assertEquals(expectedProgramId, met.get("NC_GLOBAL#prg_ID"));
+ assertNotNull(met.get("NC_GLOBAL#prg_ID"));
+ assertEquals(expectedProgramId, met.get("NC_GLOBAL#prg_ID"));
+ assertNotNull(met.get("NC_GLOBAL#project_id"));
+ assertEquals(expectedProjectId, met.get("NC_GLOBAL#project_id"));
+ assertNotNull(met.get("NC_GLOBAL#realization"));
+ assertEquals(expectedRealization, met.get("NC_GLOBAL#realization"));
+ assertNotNull(met.get("NC_GLOBAL#title"));
+ assertEquals(expectedTitle, met.get("NC_GLOBAL#title"));
+ assertNotNull(met.get("SUBDATASET_8_NAME"));
+ assertTrue(met.get("SUBDATASET_8_NAME").endsWith(expectedSub8Name));
+ assertNotNull(met.get("SUBDATASET_8_DESC"));
+ assertEquals(expectedSub8Desc, met.get("SUBDATASET_8_DESC"));
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+
+ @Test
+ public void testParseFITS() {
+ String fitsFilename = "/test-documents/WFPC2u5780205r_c0fx.fits";
+
+ assumeTrue(canRun());
+    // If gdalinfo exits with code 1 for this file (meaning the installed
+    // build of GDAL does not support FITS), skip the test.
+ String[] fitsCommand = {"gdalinfo", TestGDALParser.class.getResource(fitsFilename).getPath()};
+ assumeTrue(ExternalParser.check(fitsCommand, 1));
+
+ String expectedAllgMin = "-7.319537E1";
+ String expectedAtodcorr = "COMPLETE";
+ String expectedAtodfile = "uref$dbu1405iu.r1h";
+ String expectedCalVersion = " ";
+ String expectedCalibDef = "1466";
+
+ GDALParser parser = new GDALParser();
+ InputStream stream = TestGDALParser.class
+ .getResourceAsStream(fitsFilename);
+ Metadata met = new Metadata();
+ BodyContentHandler handler = new BodyContentHandler();
+ try {
+ parser.parse(stream, handler, met, new ParseContext());
+ assertNotNull(met);
+ assertNotNull(met.get("ALLG-MIN"));
+ assertEquals(expectedAllgMin, met.get("ALLG-MIN"));
+ assertNotNull(met.get("ATODCORR"));
+ assertEquals(expectedAtodcorr, met.get("ATODCORR"));
+ assertNotNull(met.get("ATODFILE"));
+ assertEquals(expectedAtodfile, met.get("ATODFILE"));
+ assertNotNull(met.get("CAL_VER"));
+ assertEquals(expectedCalVersion, met.get("CAL_VER"));
+ assertNotNull(met.get("CALIBDEF"));
+ assertEquals(expectedCalibDef, met.get("CALIBDEF"));
+
+ } catch (Exception e) {
+ e.printStackTrace();
+ fail(e.getMessage());
+ }
+ }
+}