You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/29 11:11:42 UTC
[36/39] tika git commit: Convert new lines from windows to unix
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java b/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
index cc22347..1695859 100644
--- a/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-advanced-module/src/main/java/org/apache/tika/module/advanced/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.advanced.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.advanced.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/pom.xml b/tika-parser-modules/tika-parser-cad-module/pom.xml
index 6e7efb6..a9f8f31 100644
--- a/tika-parser-modules/tika-parser-cad-module/pom.xml
+++ b/tika-parser-modules/tika-parser-cad-module/pom.xml
@@ -1,56 +1,56 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-cad-module</artifactId>
- <name>Apache Tika parser CAD module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <dependency>
- <groupId>commons-lang</groupId>
- <artifactId>commons-lang</artifactId>
- <version>2.6</version>
- </dependency>
-
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-cad-module</artifactId>
+ <name>Apache Tika parser CAD module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ <version>2.6</version>
+ </dependency>
+
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
index 4a23b73..29a099c 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/module/cad/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.cad.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.cad.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 3f29c1f..875c4ee 100644
--- a/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parser-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -1,356 +1,356 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.dwg;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.commons.io.IOUtils;
-import org.apache.tika.io.StringUtil;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.EndianUtils;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.Property;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * DWG (CAD Drawing) parser. This is a very basic parser, which just
- * looks for bits of the headers.
- * Note that we use Apache POI for various parts of the processing, as
- * lots of the low level string/int/short concepts are the same.
- */
-public class DWGParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -7744232583079169119L;
-
- private static MediaType TYPE = MediaType.image("vnd.dwg");
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return Collections.singleton(TYPE);
- }
-
- /** The order of the fields in the header */
- private static final Property[] HEADER_PROPERTIES_ENTRIES = {
- TikaCoreProperties.TITLE,
- TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
- TikaCoreProperties.CREATOR,
- TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
- TikaCoreProperties.COMMENTS,
- TikaCoreProperties.MODIFIER,
- null, // Unknown?
- TikaCoreProperties.RELATION, // Hyperlink
- };
-
- /** For the 2000 file, they're indexed */
- private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
- null,
- TikaCoreProperties.RELATION, // 0x01
- TikaCoreProperties.TITLE, // 0x02
- TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, // 0x03
- TikaCoreProperties.CREATOR, // 0x04
- null,
- TikaCoreProperties.COMMENTS,// 0x06
- TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, // 0x07
- TikaCoreProperties.MODIFIER, // 0x08
- };
-
- private static final String HEADER_2000_PROPERTIES_MARKER_STR =
- "DWGPROPS COOKIE";
-
- private static final byte[] HEADER_2000_PROPERTIES_MARKER =
- new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
-
- static {
- StringUtil.putCompressedUnicode(
- HEADER_2000_PROPERTIES_MARKER_STR,
- HEADER_2000_PROPERTIES_MARKER, 0);
- }
-
- /**
- * How far to skip after the last standard property, before
- * we find any custom properties that might be there.
- */
- private static final int CUSTOM_PROPERTIES_SKIP = 20;
-
- /**
- * The value of padding bytes other than 0 in some DWG files.
- */
- private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, TikaException, SAXException {
- // First up, which version of the format are we handling?
- byte[] header = new byte[128];
- IOUtils.readFully(stream, header);
- String version = new String(header, 0, 6, "US-ASCII");
-
- XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
- xhtml.startDocument();
-
- if (version.equals("AC1015")) {
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipTo2000PropertyInfoSection(stream, header)) {
- get2000Props(stream,metadata,xhtml);
- }
- } else if (version.equals("AC1018")) {
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipToPropertyInfoSection(stream, header)) {
- get2004Props(stream,metadata,xhtml);
- }
- } else if (version.equals("AC1021") || version.equals("AC1024")) {
- metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
- if (skipToPropertyInfoSection(stream, header)) {
- get2007and2010Props(stream,metadata,xhtml);
- }
- } else {
- throw new TikaException(
- "Unsupported AutoCAD drawing version: " + version);
- }
-
- xhtml.endDocument();
- }
-
- /**
- * Stored as US-ASCII
- */
- private void get2004Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, TikaException, SAXException {
- // Standard properties
- for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
- String headerValue = read2004String(stream);
- handleHeader(i, headerValue, metadata, xhtml);
- }
-
- // Custom properties
- int customCount = skipToCustomProperties(stream);
- for (int i = 0; i < customCount; i++) {
- String propName = read2004String(stream);
- String propValue = read2004String(stream);
- if(propName.length() > 0 && propValue.length() > 0) {
- metadata.add(propName, propValue);
- }
- }
- }
-
- private String read2004String(InputStream stream) throws IOException, TikaException {
- int stringLen = EndianUtils.readUShortLE(stream);
-
- byte[] stringData = new byte[stringLen];
- IOUtils.readFully(stream, stringData);
-
- // Often but not always null terminated
- if (stringData[stringLen-1] == 0) {
- stringLen--;
- }
- String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
- return value;
- }
-
- /**
- * Stored as UCS2, so 16 bit "unicode"
- */
- private void get2007and2010Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, TikaException, SAXException {
- // Standard properties
- for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
- String headerValue = read2007and2010String(stream);
- handleHeader(i, headerValue, metadata, xhtml);
- }
-
- // Custom properties
- int customCount = skipToCustomProperties(stream);
- for (int i = 0; i < customCount; i++) {
- String propName = read2007and2010String(stream);
- String propValue = read2007and2010String(stream);
- if(propName.length() > 0 && propValue.length() > 0) {
- metadata.add(propName, propValue);
- }
- }
- }
-
- private String read2007and2010String(InputStream stream) throws IOException, TikaException {
- int stringLen = EndianUtils.readUShortLE(stream);
-
- byte[] stringData = new byte[stringLen * 2];
- IOUtils.readFully(stream, stringData);
- String value = StringUtil.getFromUnicodeLE(stringData);
-
- // Some strings are null terminated
- if(value.charAt(value.length()-1) == 0) {
- value = value.substring(0, value.length()-1);
- }
-
- return value;
- }
-
- private void get2000Props(
- InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
- throws IOException, TikaException, SAXException {
- int propCount = 0;
- while(propCount < 30) {
- int propIdx = EndianUtils.readUShortLE(stream);
- int length = EndianUtils.readUShortLE(stream);
- int valueType = stream.read();
-
- if(propIdx == 0x28) {
- // This one seems not to follow the pattern
- length = 0x19;
- } else if(propIdx == 90) {
- // We think this means the end of properties
- break;
- }
-
- byte[] value = new byte[length];
- IOUtils.readFully(stream, value);
- if(valueType == 0x1e) {
- // Normal string, good
- String val = StringUtil.getFromCompressedUnicode(value, 0, length);
-
- // Is it one we can look up by index?
- if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
- metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
- xhtml.element("p", val);
- } else if(propIdx == 0x012c) {
- int splitAt = val.indexOf('=');
- if(splitAt > -1) {
- String propName = val.substring(0, splitAt);
- String propVal = val.substring(splitAt+1);
- metadata.add(propName, propVal);
- }
- }
- } else {
- // No idea...
- }
-
- propCount++;
- }
- }
-
- private void handleHeader(
- int headerNumber, String value, Metadata metadata,
- XHTMLContentHandler xhtml) throws SAXException {
- if(value == null || value.length() == 0) {
- return;
- }
-
- Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
- if(headerProp != null) {
- metadata.set(headerProp, value);
- }
-
- xhtml.element("p", value);
- }
-
- /**
- * Grab the offset, then skip there
- */
- private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
- throws IOException, TikaException {
- // The offset is stored in the header from 0x20 onwards
- long offsetToSection = EndianUtils.getLongLE(header, 0x20);
-
- // Sanity check the offset. Some files seem to use a different format,
- // and the offset isn't available at 0x20. Until we can work out how
- // to find the offset in those files, skip them if detected
- if (offsetToSection > 0xa00000l) {
- // Header should never be more than 10mb into the file, something is wrong
- offsetToSection = 0;
- }
-
- // Work out how far to skip, and sanity check
- long toSkip = offsetToSection - header.length;
- if(offsetToSection == 0){
- return false;
- }
- while (toSkip > 0) {
- byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
- IOUtils.readFully(stream, skip);
- toSkip -= skip.length;
- }
- return true;
- }
-
- /**
- * We think it can be anywhere...
- */
- private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
- throws IOException {
- int val = 0;
- while(val != -1) {
- val = stream.read();
- if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
- boolean going = true;
- for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
- val = stream.read();
- if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
- }
- if(going) {
- // Bingo, found it
- return true;
- }
- }
- }
- return false;
- }
-
- private int skipToCustomProperties(InputStream stream)
- throws IOException, TikaException {
- // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
- byte[] padding = new byte[4];
- IOUtils.readFully(stream, padding);
- if((padding[0] == 0 && padding[1] == 0 &&
- padding[2] == 0 && padding[3] == 0) ||
- (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
- padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
- padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
- padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
-
- // Looks hopeful, skip on
- padding = new byte[CUSTOM_PROPERTIES_SKIP];
- IOUtils.readFully(stream, padding);
-
- // We should now have the count
- int count = EndianUtils.readUShortLE(stream);
-
- // Sanity check it
- if(count > 0 && count < 0x7f) {
- // Looks plausible
- return count;
- } else {
- // No properties / count is too high to trust
- return 0;
- }
- } else {
- // No padding. That probably means no custom props
- return 0;
- }
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.dwg;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.tika.io.StringUtil;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.EndianUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * DWG (CAD Drawing) parser. This is a very basic parser, which just
+ * looks for bits of the headers.
+ * Note that we use Apache POI for various parts of the processing, as
+ * lots of the low level string/int/short concepts are the same.
+ */
+public class DWGParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -7744232583079169119L;
+
+ private static MediaType TYPE = MediaType.image("vnd.dwg");
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return Collections.singleton(TYPE);
+ }
+
+ /** The order of the fields in the header */
+ private static final Property[] HEADER_PROPERTIES_ENTRIES = {
+ TikaCoreProperties.TITLE,
+ TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION,
+ TikaCoreProperties.CREATOR,
+ TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT,
+ TikaCoreProperties.COMMENTS,
+ TikaCoreProperties.MODIFIER,
+ null, // Unknown?
+ TikaCoreProperties.RELATION, // Hyperlink
+ };
+
+ /** For the 2000 file, they're indexed */
+ private static final Property[] HEADER_2000_PROPERTIES_ENTRIES = {
+ null,
+ TikaCoreProperties.RELATION, // 0x01
+ TikaCoreProperties.TITLE, // 0x02
+ TikaCoreProperties.TRANSITION_SUBJECT_TO_DC_DESCRIPTION, // 0x03
+ TikaCoreProperties.CREATOR, // 0x04
+ null,
+ TikaCoreProperties.COMMENTS,// 0x06
+ TikaCoreProperties.TRANSITION_KEYWORDS_TO_DC_SUBJECT, // 0x07
+ TikaCoreProperties.MODIFIER, // 0x08
+ };
+
+ private static final String HEADER_2000_PROPERTIES_MARKER_STR =
+ "DWGPROPS COOKIE";
+
+ private static final byte[] HEADER_2000_PROPERTIES_MARKER =
+ new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
+
+ static {
+ StringUtil.putCompressedUnicode(
+ HEADER_2000_PROPERTIES_MARKER_STR,
+ HEADER_2000_PROPERTIES_MARKER, 0);
+ }
+
+ /**
+ * How far to skip after the last standard property, before
+ * we find any custom properties that might be there.
+ */
+ private static final int CUSTOM_PROPERTIES_SKIP = 20;
+
+ /**
+ * The value of padding bytes other than 0 in some DWG files.
+ */
+ private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[] {0x2, 0, 0, 0};
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, TikaException, SAXException {
+ // First up, which version of the format are we handling?
+ byte[] header = new byte[128];
+ IOUtils.readFully(stream, header);
+ String version = new String(header, 0, 6, "US-ASCII");
+
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ xhtml.startDocument();
+
+ if (version.equals("AC1015")) {
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipTo2000PropertyInfoSection(stream, header)) {
+ get2000Props(stream,metadata,xhtml);
+ }
+ } else if (version.equals("AC1018")) {
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipToPropertyInfoSection(stream, header)) {
+ get2004Props(stream,metadata,xhtml);
+ }
+ } else if (version.equals("AC1021") || version.equals("AC1024")) {
+ metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());
+ if (skipToPropertyInfoSection(stream, header)) {
+ get2007and2010Props(stream,metadata,xhtml);
+ }
+ } else {
+ throw new TikaException(
+ "Unsupported AutoCAD drawing version: " + version);
+ }
+
+ xhtml.endDocument();
+ }
+
+ /**
+ * Stored as US-ASCII
+ */
+ private void get2004Props(
+ InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, TikaException, SAXException {
+ // Standard properties
+ for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
+ String headerValue = read2004String(stream);
+ handleHeader(i, headerValue, metadata, xhtml);
+ }
+
+ // Custom properties
+ int customCount = skipToCustomProperties(stream);
+ for (int i = 0; i < customCount; i++) {
+ String propName = read2004String(stream);
+ String propValue = read2004String(stream);
+ if(propName.length() > 0 && propValue.length() > 0) {
+ metadata.add(propName, propValue);
+ }
+ }
+ }
+
+ private String read2004String(InputStream stream) throws IOException, TikaException {
+ int stringLen = EndianUtils.readUShortLE(stream);
+
+ byte[] stringData = new byte[stringLen];
+ IOUtils.readFully(stream, stringData);
+
+ // Often but not always null terminated
+ if (stringData[stringLen-1] == 0) {
+ stringLen--;
+ }
+ String value = StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
+ return value;
+ }
+
+ /**
+ * Stored as UCS2, so 16 bit "unicode"
+ */
+ private void get2007and2010Props(
+ InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, TikaException, SAXException {
+ // Standard properties
+ for (int i = 0; i < HEADER_PROPERTIES_ENTRIES.length; i++) {
+ String headerValue = read2007and2010String(stream);
+ handleHeader(i, headerValue, metadata, xhtml);
+ }
+
+ // Custom properties
+ int customCount = skipToCustomProperties(stream);
+ for (int i = 0; i < customCount; i++) {
+ String propName = read2007and2010String(stream);
+ String propValue = read2007and2010String(stream);
+ if(propName.length() > 0 && propValue.length() > 0) {
+ metadata.add(propName, propValue);
+ }
+ }
+ }
+
+ private String read2007and2010String(InputStream stream) throws IOException, TikaException {
+ int stringLen = EndianUtils.readUShortLE(stream);
+
+ byte[] stringData = new byte[stringLen * 2];
+ IOUtils.readFully(stream, stringData);
+ String value = StringUtil.getFromUnicodeLE(stringData);
+
+ // Some strings are null terminated
+ if(value.charAt(value.length()-1) == 0) {
+ value = value.substring(0, value.length()-1);
+ }
+
+ return value;
+ }
+
+ private void get2000Props(
+ InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, TikaException, SAXException {
+ int propCount = 0;
+ while(propCount < 30) {
+ int propIdx = EndianUtils.readUShortLE(stream);
+ int length = EndianUtils.readUShortLE(stream);
+ int valueType = stream.read();
+
+ if(propIdx == 0x28) {
+ // This one seems not to follow the pattern
+ length = 0x19;
+ } else if(propIdx == 90) {
+ // We think this means the end of properties
+ break;
+ }
+
+ byte[] value = new byte[length];
+ IOUtils.readFully(stream, value);
+ if(valueType == 0x1e) {
+ // Normal string, good
+ String val = StringUtil.getFromCompressedUnicode(value, 0, length);
+
+ // Is it one we can look up by index?
+ if(propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
+ metadata.add(HEADER_2000_PROPERTIES_ENTRIES[propIdx], val);
+ xhtml.element("p", val);
+ } else if(propIdx == 0x012c) {
+ int splitAt = val.indexOf('=');
+ if(splitAt > -1) {
+ String propName = val.substring(0, splitAt);
+ String propVal = val.substring(splitAt+1);
+ metadata.add(propName, propVal);
+ }
+ }
+ } else {
+ // No idea...
+ }
+
+ propCount++;
+ }
+ }
+
+ private void handleHeader(
+ int headerNumber, String value, Metadata metadata,
+ XHTMLContentHandler xhtml) throws SAXException {
+ if(value == null || value.length() == 0) {
+ return;
+ }
+
+ Property headerProp = HEADER_PROPERTIES_ENTRIES[headerNumber];
+ if(headerProp != null) {
+ metadata.set(headerProp, value);
+ }
+
+ xhtml.element("p", value);
+ }
+
+ /**
+ * Grab the offset, then skip there
+ */
+ private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
+ throws IOException, TikaException {
+ // The offset is stored in the header from 0x20 onwards
+ long offsetToSection = EndianUtils.getLongLE(header, 0x20);
+
+ // Sanity check the offset. Some files seem to use a different format,
+ // and the offset isn't available at 0x20. Until we can work out how
+ // to find the offset in those files, skip them if detected
+ if (offsetToSection > 0xa00000l) {
+ // Header should never be more than 10mb into the file, something is wrong
+ offsetToSection = 0;
+ }
+
+ // Work out how far to skip, and sanity check
+ long toSkip = offsetToSection - header.length;
+ if(offsetToSection == 0){
+ return false;
+ }
+ while (toSkip > 0) {
+ byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
+ IOUtils.readFully(stream, skip);
+ toSkip -= skip.length;
+ }
+ return true;
+ }
+
+ /**
+ * We think it can be anywhere...
+ */
+ private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
+ throws IOException {
+ int val = 0;
+ while(val != -1) {
+ val = stream.read();
+ if(val == HEADER_2000_PROPERTIES_MARKER[0]) {
+ boolean going = true;
+ for(int i=1; i<HEADER_2000_PROPERTIES_MARKER.length && going; i++) {
+ val = stream.read();
+ if(val != HEADER_2000_PROPERTIES_MARKER[i]) going = false;
+ }
+ if(going) {
+ // Bingo, found it
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ private int skipToCustomProperties(InputStream stream)
+ throws IOException, TikaException {
+ // There should be 4 zero bytes or CUSTOM_PROPERTIES_ALT_PADDING_VALUES next
+ byte[] padding = new byte[4];
+ IOUtils.readFully(stream, padding);
+ if((padding[0] == 0 && padding[1] == 0 &&
+ padding[2] == 0 && padding[3] == 0) ||
+ (padding[0] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[0] &&
+ padding[1] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[1] &&
+ padding[2] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[2] &&
+ padding[3] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[3])) {
+
+ // Looks hopeful, skip on
+ padding = new byte[CUSTOM_PROPERTIES_SKIP];
+ IOUtils.readFully(stream, padding);
+
+ // We should now have the count
+ int count = EndianUtils.readUShortLE(stream);
+
+ // Sanity check it
+ if(count > 0 && count < 0x7f) {
+ // Looks plausible
+ return count;
+ } else {
+ // No properties / count is too high to trust
+ return 0;
+ }
+ } else {
+ // No padding. That probably means no custom props
+ return 0;
+ }
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/pom.xml
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/pom.xml b/tika-parser-modules/tika-parser-code-module/pom.xml
index cf59c0e..5d33f82 100644
--- a/tika-parser-modules/tika-parser-code-module/pom.xml
+++ b/tika-parser-modules/tika-parser-code-module/pom.xml
@@ -1,69 +1,69 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- you under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
-
- <parent>
- <groupId>org.apache.tika</groupId>
- <artifactId>tika-parser-modules</artifactId>
- <version>2.0-SNAPSHOT</version>
- </parent>
-
- <artifactId>tika-parser-code-module</artifactId>
- <name>Apache Tika parser code module</name>
- <url>http://tika.apache.org/</url>
-
- <dependencies>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-core</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>org.ow2.asm</groupId>
- <artifactId>asm</artifactId>
- <version>5.0.4</version>
- </dependency>
- <dependency>
- <groupId>org.codelibs</groupId>
- <artifactId>jhighlight</artifactId>
- <version>1.0.2</version>
- </dependency>
- <dependency>
- <groupId>org.ccil.cowan.tagsoup</groupId>
- <artifactId>tagsoup</artifactId>
- <version>1.2.1</version>
- </dependency>
- <dependency>
- <groupId>commons-io</groupId>
- <artifactId>commons-io</artifactId>
- <version>${commons.io.version}</version>
- </dependency>
- <dependency>
- <groupId>${project.groupId}</groupId>
- <artifactId>tika-parser-text-module</artifactId>
- <version>${project.version}</version>
- <scope>test</scope>
- </dependency>
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-dependency-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parser-modules</artifactId>
+ <version>2.0-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>tika-parser-code-module</artifactId>
+ <name>Apache Tika parser code module</name>
+ <url>http://tika.apache.org/</url>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.ow2.asm</groupId>
+ <artifactId>asm</artifactId>
+ <version>5.0.4</version>
+ </dependency>
+ <dependency>
+ <groupId>org.codelibs</groupId>
+ <artifactId>jhighlight</artifactId>
+ <version>1.0.2</version>
+ </dependency>
+ <dependency>
+ <groupId>org.ccil.cowan.tagsoup</groupId>
+ <artifactId>tagsoup</artifactId>
+ <version>1.2.1</version>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>${commons.io.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parser-text-module</artifactId>
+ <version>${project.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-dependency-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
</project>
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
index 040618d..095e643 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/module/code/internal/Activator.java
@@ -1,36 +1,36 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.module.code.internal;
-
-import org.apache.tika.osgi.TikaAbstractBundleActivator;
-import org.osgi.framework.BundleContext;
-
-public class Activator extends TikaAbstractBundleActivator {
-
- @Override
- public void start(BundleContext context) throws Exception {
-
- registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
-
- }
-
- @Override
- public void stop(BundleContext context) throws Exception {
-
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.module.code.internal;
+
+import org.apache.tika.osgi.TikaAbstractBundleActivator;
+import org.osgi.framework.BundleContext;
+
+public class Activator extends TikaAbstractBundleActivator {
+
+ @Override
+ public void start(BundleContext context) throws Exception {
+
+ registerTikaParserServiceLoader(context, Activator.class.getClassLoader());
+
+ }
+
+ @Override
+ public void stop(BundleContext context) throws Exception {
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
index 48f8cbf..481046f 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/ClassParser.java
@@ -1,54 +1,54 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.asm;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.Set;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
-import org.apache.tika.parser.ParseContext;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Parser for Java .class files.
- */
-public class ClassParser extends AbstractParser {
-
- /** Serial version UID */
- private static final long serialVersionUID = -3531388963354454357L;
-
- private static final Set<MediaType> SUPPORTED_TYPES =
- Collections.singleton(MediaType.application("java-vm"));
-
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return SUPPORTED_TYPES;
- }
-
- public void parse(
- InputStream stream, ContentHandler handler,
- Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
- new XHTMLClassVisitor(handler, metadata).parse(stream);
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Set;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Parser for Java .class files.
+ */
+public class ClassParser extends AbstractParser {
+
+ /** Serial version UID */
+ private static final long serialVersionUID = -3531388963354454357L;
+
+ private static final Set<MediaType> SUPPORTED_TYPES =
+ Collections.singleton(MediaType.application("java-vm"));
+
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ return SUPPORTED_TYPES;
+ }
+
+ public void parse(
+ InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ new XHTMLClassVisitor(handler, metadata).parse(stream);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
index 03deb43..c8ea317 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/asm/XHTMLClassVisitor.java
@@ -1,323 +1,323 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.asm;
-
-import java.io.IOException;
-import java.io.InputStream;
-
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.sax.XHTMLContentHandler;
-import org.objectweb.asm.AnnotationVisitor;
-import org.objectweb.asm.Attribute;
-import org.objectweb.asm.ClassReader;
-import org.objectweb.asm.ClassVisitor;
-import org.objectweb.asm.FieldVisitor;
-import org.objectweb.asm.MethodVisitor;
-import org.objectweb.asm.Opcodes;
-import org.objectweb.asm.Type;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-
-/**
- * Class visitor that generates XHTML SAX events to describe the
- * contents of the visited class.
- */
-class XHTMLClassVisitor extends ClassVisitor {
-
- private final XHTMLContentHandler xhtml;
-
- private final Metadata metadata;
-
- private Type type;
-
- private String packageName;
-
- public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
- super(Opcodes.ASM5);
- this.xhtml = new XHTMLContentHandler(handler, metadata);
- this.metadata = metadata;
- }
-
- public void parse(InputStream stream)
- throws TikaException, SAXException, IOException {
- try {
- ClassReader reader = new ClassReader(stream);
- reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
- } catch (RuntimeException e) {
- if (e.getCause() instanceof SAXException) {
- throw (SAXException) e.getCause();
- } else {
- throw new TikaException("Failed to parse a Java class", e);
- }
- }
- }
-
- public void visit(
- int version, int access, String name, String signature,
- String superName, String[] interfaces) {
- type = Type.getObjectType(name);
-
- String className = type.getClassName();
- int dot = className.lastIndexOf('.');
- if (dot != -1) {
- packageName = className.substring(0, dot);
- className = className.substring(dot + 1);
- }
-
- metadata.set(TikaCoreProperties.TITLE, className);
- metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
-
- try {
- xhtml.startDocument();
- xhtml.startElement("pre");
-
- if (packageName != null) {
- writeKeyword("package");
- xhtml.characters(" " + packageName + ";\n");
- }
-
- writeAccess(access);
- if (isSet(access, Opcodes.ACC_INTERFACE)) {
- writeKeyword("interface");
- writeSpace();
- writeType(type);
- writeSpace();
- writeInterfaces("extends", interfaces);
- } else if (isSet(access, Opcodes.ACC_ENUM)) {
- writeKeyword("enum");
- writeSpace();
- writeType(type);
- writeSpace();
- } else {
- writeKeyword("class");
- writeSpace();
- writeType(type);
- writeSpace();
- if (superName != null) {
- Type superType = Type.getObjectType(superName);
- if (!superType.getClassName().equals("java.lang.Object")) {
- writeKeyword("extends");
- writeSpace();
- writeType(superType);
- writeSpace();
- }
- }
- writeInterfaces("implements", interfaces);
- }
- xhtml.characters("{\n");
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- private void writeInterfaces(String keyword, String[] interfaces)
- throws SAXException {
- if (interfaces != null && interfaces.length > 0) {
- writeKeyword(keyword);
- String separator = " ";
- for (String iface : interfaces) {
- xhtml.characters(separator);
- writeType(Type.getObjectType(iface));
- separator = ", ";
- }
- writeSpace();
- }
- }
-
- public void visitEnd() {
- try {
- xhtml.characters("}\n");
- xhtml.endElement("pre");
- xhtml.endDocument();
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- /**
- * Ignored.
- */
- public void visitOuterClass(String owner, String name, String desc) {
- }
-
- /**
- * Ignored.
- */
- public void visitSource(String source, String debug) {
- }
-
-
- /**
- * Ignored.
- */
- public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
- return null;
- }
-
- /**
- * Ignored.
- */
- public void visitAttribute(Attribute attr) {
- }
-
- /**
- * Ignored.
- */
- public void visitInnerClass(
- String name, String outerName, String innerName, int access) {
- }
-
- /**
- * Visits a field.
- */
- public FieldVisitor visitField(
- int access, String name, String desc, String signature,
- Object value) {
- if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
- try {
- xhtml.characters(" ");
- writeAccess(access);
- writeType(Type.getType(desc));
- writeSpace();
- writeIdentifier(name);
-
- if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
- xhtml.characters(" = ");
- xhtml.characters(value.toString());
- }
-
- writeSemicolon();
- writeNewline();
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- return null;
- }
-
- /**
- * Visits a method.
- */
- public MethodVisitor visitMethod(
- int access, String name, String desc, String signature,
- String[] exceptions) {
- if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
- try {
- xhtml.characters(" ");
- writeAccess(access);
- writeType(Type.getReturnType(desc));
- writeSpace();
- if ("<init>".equals(name)) {
- writeType(type);
- } else {
- writeIdentifier(name);
- }
-
- xhtml.characters("(");
- String separator = "";
- for (Type arg : Type.getArgumentTypes(desc)) {
- xhtml.characters(separator);
- writeType(arg);
- separator = ", ";
- }
- xhtml.characters(")");
-
- if (exceptions != null && exceptions.length > 0) {
- writeSpace();
- writeKeyword("throws");
- separator = " ";
- for (String exception : exceptions) {
- xhtml.characters(separator);
- writeType(Type.getObjectType(exception));
- separator = ", ";
- }
- }
-
- writeSemicolon();
- writeNewline();
- } catch (SAXException e) {
- throw new RuntimeException(e);
- }
- }
-
- return null;
- }
-
- private void writeIdentifier(String identifier) throws SAXException {
- xhtml.startElement("span", "class", "java-identifier");
- xhtml.characters(identifier);
- xhtml.endElement("span");
- }
-
- private void writeKeyword(String keyword) throws SAXException {
- xhtml.startElement("span", "class", "java-keyword");
- xhtml.characters(keyword);
- xhtml.endElement("span");
- }
-
- private void writeSemicolon() throws SAXException {
- xhtml.characters(";");
- }
-
- private void writeSpace() throws SAXException {
- xhtml.characters(" ");
- }
-
- private void writeNewline() throws SAXException {
- xhtml.characters("\n");
- }
-
- private void writeAccess(int access) throws SAXException {
- writeAccess(access, Opcodes.ACC_PRIVATE, "private");
- writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
- writeAccess(access, Opcodes.ACC_PUBLIC, "public");
- writeAccess(access, Opcodes.ACC_STATIC, "static");
- writeAccess(access, Opcodes.ACC_FINAL, "final");
- writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
- writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
- writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
- writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
- writeAccess(access, Opcodes.ACC_NATIVE, "native");
- }
-
- private void writeAccess(int access, int code, String keyword)
- throws SAXException {
- if (isSet(access, code)) {
- writeKeyword(keyword);
- xhtml.characters(" ");
- }
- }
-
- private void writeType(Type type) throws SAXException {
- String name = type.getClassName();
- if (name.startsWith(packageName + ".")) {
- xhtml.characters(name.substring(packageName.length() + 1));
- } else if (name.startsWith("java.lang.")) {
- xhtml.characters(name.substring("java.lang.".length()));
- } else {
- xhtml.characters(name);
- }
- }
-
- private static boolean isSet(int value, int flag) {
- return (value & flag) != 0;
- }
-
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.asm;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.sax.XHTMLContentHandler;
+import org.objectweb.asm.AnnotationVisitor;
+import org.objectweb.asm.Attribute;
+import org.objectweb.asm.ClassReader;
+import org.objectweb.asm.ClassVisitor;
+import org.objectweb.asm.FieldVisitor;
+import org.objectweb.asm.MethodVisitor;
+import org.objectweb.asm.Opcodes;
+import org.objectweb.asm.Type;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+/**
+ * Class visitor that generates XHTML SAX events to describe the
+ * contents of the visited class.
+ */
+class XHTMLClassVisitor extends ClassVisitor {
+
+ private final XHTMLContentHandler xhtml;
+
+ private final Metadata metadata;
+
+ private Type type;
+
+ private String packageName;
+
+ public XHTMLClassVisitor(ContentHandler handler, Metadata metadata) {
+ super(Opcodes.ASM5);
+ this.xhtml = new XHTMLContentHandler(handler, metadata);
+ this.metadata = metadata;
+ }
+
+ public void parse(InputStream stream)
+ throws TikaException, SAXException, IOException {
+ try {
+ ClassReader reader = new ClassReader(stream);
+ reader.accept(this, ClassReader.SKIP_FRAMES | ClassReader.SKIP_CODE);
+ } catch (RuntimeException e) {
+ if (e.getCause() instanceof SAXException) {
+ throw (SAXException) e.getCause();
+ } else {
+ throw new TikaException("Failed to parse a Java class", e);
+ }
+ }
+ }
+
+ public void visit(
+ int version, int access, String name, String signature,
+ String superName, String[] interfaces) {
+ type = Type.getObjectType(name);
+
+ String className = type.getClassName();
+ int dot = className.lastIndexOf('.');
+ if (dot != -1) {
+ packageName = className.substring(0, dot);
+ className = className.substring(dot + 1);
+ }
+
+ metadata.set(TikaCoreProperties.TITLE, className);
+ metadata.set(Metadata.RESOURCE_NAME_KEY, className + ".class");
+
+ try {
+ xhtml.startDocument();
+ xhtml.startElement("pre");
+
+ if (packageName != null) {
+ writeKeyword("package");
+ xhtml.characters(" " + packageName + ";\n");
+ }
+
+ writeAccess(access);
+ if (isSet(access, Opcodes.ACC_INTERFACE)) {
+ writeKeyword("interface");
+ writeSpace();
+ writeType(type);
+ writeSpace();
+ writeInterfaces("extends", interfaces);
+ } else if (isSet(access, Opcodes.ACC_ENUM)) {
+ writeKeyword("enum");
+ writeSpace();
+ writeType(type);
+ writeSpace();
+ } else {
+ writeKeyword("class");
+ writeSpace();
+ writeType(type);
+ writeSpace();
+ if (superName != null) {
+ Type superType = Type.getObjectType(superName);
+ if (!superType.getClassName().equals("java.lang.Object")) {
+ writeKeyword("extends");
+ writeSpace();
+ writeType(superType);
+ writeSpace();
+ }
+ }
+ writeInterfaces("implements", interfaces);
+ }
+ xhtml.characters("{\n");
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ private void writeInterfaces(String keyword, String[] interfaces)
+ throws SAXException {
+ if (interfaces != null && interfaces.length > 0) {
+ writeKeyword(keyword);
+ String separator = " ";
+ for (String iface : interfaces) {
+ xhtml.characters(separator);
+ writeType(Type.getObjectType(iface));
+ separator = ", ";
+ }
+ writeSpace();
+ }
+ }
+
+ public void visitEnd() {
+ try {
+ xhtml.characters("}\n");
+ xhtml.endElement("pre");
+ xhtml.endDocument();
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Ignored.
+ */
+ public void visitOuterClass(String owner, String name, String desc) {
+ }
+
+ /**
+ * Ignored.
+ */
+ public void visitSource(String source, String debug) {
+ }
+
+
+ /**
+ * Ignored.
+ */
+ public AnnotationVisitor visitAnnotation(String desc, boolean visible) {
+ return null;
+ }
+
+ /**
+ * Ignored.
+ */
+ public void visitAttribute(Attribute attr) {
+ }
+
+ /**
+ * Ignored.
+ */
+ public void visitInnerClass(
+ String name, String outerName, String innerName, int access) {
+ }
+
+ /**
+ * Visits a field.
+ */
+ public FieldVisitor visitField(
+ int access, String name, String desc, String signature,
+ Object value) {
+ if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+ try {
+ xhtml.characters(" ");
+ writeAccess(access);
+ writeType(Type.getType(desc));
+ writeSpace();
+ writeIdentifier(name);
+
+ if (isSet(access, Opcodes.ACC_STATIC) && value != null) {
+ xhtml.characters(" = ");
+ xhtml.characters(value.toString());
+ }
+
+ writeSemicolon();
+ writeNewline();
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ return null;
+ }
+
+ /**
+ * Visits a method.
+ */
+ public MethodVisitor visitMethod(
+ int access, String name, String desc, String signature,
+ String[] exceptions) {
+ if (!isSet(access, Opcodes.ACC_SYNTHETIC)) {
+ try {
+ xhtml.characters(" ");
+ writeAccess(access);
+ writeType(Type.getReturnType(desc));
+ writeSpace();
+ if ("<init>".equals(name)) {
+ writeType(type);
+ } else {
+ writeIdentifier(name);
+ }
+
+ xhtml.characters("(");
+ String separator = "";
+ for (Type arg : Type.getArgumentTypes(desc)) {
+ xhtml.characters(separator);
+ writeType(arg);
+ separator = ", ";
+ }
+ xhtml.characters(")");
+
+ if (exceptions != null && exceptions.length > 0) {
+ writeSpace();
+ writeKeyword("throws");
+ separator = " ";
+ for (String exception : exceptions) {
+ xhtml.characters(separator);
+ writeType(Type.getObjectType(exception));
+ separator = ", ";
+ }
+ }
+
+ writeSemicolon();
+ writeNewline();
+ } catch (SAXException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ return null;
+ }
+
+ private void writeIdentifier(String identifier) throws SAXException {
+ xhtml.startElement("span", "class", "java-identifier");
+ xhtml.characters(identifier);
+ xhtml.endElement("span");
+ }
+
+ private void writeKeyword(String keyword) throws SAXException {
+ xhtml.startElement("span", "class", "java-keyword");
+ xhtml.characters(keyword);
+ xhtml.endElement("span");
+ }
+
+ private void writeSemicolon() throws SAXException {
+ xhtml.characters(";");
+ }
+
+ private void writeSpace() throws SAXException {
+ xhtml.characters(" ");
+ }
+
+ private void writeNewline() throws SAXException {
+ xhtml.characters("\n");
+ }
+
+ private void writeAccess(int access) throws SAXException {
+ writeAccess(access, Opcodes.ACC_PRIVATE, "private");
+ writeAccess(access, Opcodes.ACC_PROTECTED, "protected");
+ writeAccess(access, Opcodes.ACC_PUBLIC, "public");
+ writeAccess(access, Opcodes.ACC_STATIC, "static");
+ writeAccess(access, Opcodes.ACC_FINAL, "final");
+ writeAccess(access, Opcodes.ACC_ABSTRACT, "abstract");
+ writeAccess(access, Opcodes.ACC_SYNCHRONIZED, "synchronized");
+ writeAccess(access, Opcodes.ACC_TRANSIENT, "transient");
+ writeAccess(access, Opcodes.ACC_VOLATILE, "volatile");
+ writeAccess(access, Opcodes.ACC_NATIVE, "native");
+ }
+
+ private void writeAccess(int access, int code, String keyword)
+ throws SAXException {
+ if (isSet(access, code)) {
+ writeKeyword(keyword);
+ xhtml.characters(" ");
+ }
+ }
+
+ private void writeType(Type type) throws SAXException {
+ String name = type.getClassName();
+ if (name.startsWith(packageName + ".")) {
+ xhtml.characters(name.substring(packageName.length() + 1));
+ } else if (name.startsWith("java.lang.")) {
+ xhtml.characters(name.substring("java.lang.".length()));
+ } else {
+ xhtml.characters(name);
+ }
+ }
+
+ private static boolean isSet(int value, int flag) {
+ return (value & flag) != 0;
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/tika/blob/c7a6bcac/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
index 63e4bf6..d17bde7 100644
--- a/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
+++ b/tika-parser-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/code/SourceCodeParser.java
@@ -1,142 +1,142 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.code;
-
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
-import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.StringReader;
-import java.nio.charset.Charset;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.apache.commons.io.input.CloseShieldInputStream;
-import org.apache.tika.config.ServiceLoader;
-import org.apache.tika.detect.AutoDetectReader;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.ccil.cowan.tagsoup.HTMLSchema;
-import org.ccil.cowan.tagsoup.Schema;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.InputSource;
-import org.xml.sax.SAXException;
-
-import com.uwyn.jhighlight.renderer.Renderer;
-import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
-/**
- * Generic Source code parser for Java, Groovy, C++.
- * Aware: This parser uses JHightlight library (https://github.com/codelibs/jhighlight) under CDDL/LGPL dual license
- *
- * @author Hong-Thai.Nguyen
- * @since 1.6
- */
-public class SourceCodeParser implements Parser {
-
- private static final long serialVersionUID = -4543476498190054160L;
-
- private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
-
- private static final Map<MediaType, String> TYPES_TO_RENDERER = new HashMap<MediaType, String>() {
- private static final long serialVersionUID = -741976157563751152L;
- {
- put(MediaType.text("x-c++src"), CPP);
- put(MediaType.text("x-java-source"), JAVA);
- put(MediaType.text("x-groovy"), GROOVY);
- }
- };
-
- private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
-
- //Parse the HTML document
- private static final Schema HTML_SCHEMA = new HTMLSchema();
-
- @Override
- public Set<MediaType> getSupportedTypes(ParseContext context) {
- return TYPES_TO_RENDERER.keySet();
- }
-
- @Override
- public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
- throws IOException, SAXException, TikaException {
-
- try (AutoDetectReader reader = new AutoDetectReader(
- new CloseShieldInputStream(stream), metadata,
- context.get(ServiceLoader.class, LOADER))) {
- Charset charset = reader.getCharset();
- String mediaType = metadata.get(Metadata.CONTENT_TYPE);
- String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
- if (mediaType != null && name != null) {
- MediaType type = MediaType.parse(mediaType);
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
- metadata.set(Metadata.CONTENT_ENCODING, charset.name());
-
- StringBuilder out = new StringBuilder();
- String line;
- int nbLines = 0;
- while ((line = reader.readLine()) != null) {
- out.append(line + System.getProperty("line.separator"));
- String author = parserAuthor(line);
- if (author != null) {
- metadata.add(TikaCoreProperties.CREATOR, author);
- }
- nbLines ++;
- }
- metadata.set("LoC", String.valueOf(nbLines));
- Renderer renderer = getRenderer(type.toString());
-
- String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
-
- Schema schema = context.get(Schema.class, HTML_SCHEMA);
-
- org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
- parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
- parser.setContentHandler(handler);
- parser.parse(new InputSource(new StringReader(codeAsHtml)));
- }
- }
-
- }
-
- private Renderer getRenderer(String mimeType) {
- MediaType mt = MediaType.parse(mimeType);
- String type = TYPES_TO_RENDERER.get(mt);
- if (type == null) {
- throw new RuntimeException("unparseable content type " + mimeType);
- }
- return XhtmlRendererFactory.getRenderer(type);
- }
-
-
- private String parserAuthor(String line) {
- Matcher m = authorPattern.matcher(line);
- if (m.find()) {
- return m.group(1).trim();
- }
-
- return null;
- }
-}
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.code;
+
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.CPP;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.GROOVY;
+import static com.uwyn.jhighlight.renderer.XhtmlRendererFactory.JAVA;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringReader;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.input.CloseShieldInputStream;
+import org.apache.tika.config.ServiceLoader;
+import org.apache.tika.detect.AutoDetectReader;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.ccil.cowan.tagsoup.HTMLSchema;
+import org.ccil.cowan.tagsoup.Schema;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+import com.uwyn.jhighlight.renderer.Renderer;
+import com.uwyn.jhighlight.renderer.XhtmlRendererFactory;
+/**
+ * Generic source code parser for Java, Groovy and C++.
+ * <p>
+ * The input is read through an {@link AutoDetectReader}, rendered to XHTML by
+ * the JHighlight library, and the resulting markup is parsed with TagSoup into
+ * the supplied content handler. While reading, the text after every
+ * {@code @author} tag is added as a {@link TikaCoreProperties#CREATOR} value
+ * and the number of lines read is stored under the {@code LoC} metadata key.
+ * <p>
+ * Note: this parser uses the JHighlight library
+ * (https://github.com/codelibs/jhighlight), which is dual licensed under
+ * CDDL/LGPL.
+ *
+ * @author Hong-Thai.Nguyen
+ * @since 1.6
+ */
+public class SourceCodeParser implements Parser {
+
+    private static final long serialVersionUID = -4543476498190054160L;
+
+    /** Captures the rest of the line after an {@code @author} tag (case-insensitive). */
+    private static final Pattern authorPattern = Pattern.compile("(?im)@author (.*) *$");
+
+    /**
+     * Supported media types mapped to the JHighlight renderer name. The map is
+     * unmodifiable because its key set is returned to callers by
+     * {@link #getSupportedTypes(ParseContext)}.
+     */
+    private static final Map<MediaType, String> TYPES_TO_RENDERER = buildRendererTypes();
+
+    /** Passed as the default when the parse context is asked for a {@link ServiceLoader}. */
+    private static final ServiceLoader LOADER = new ServiceLoader(SourceCodeParser.class.getClassLoader());
+
+    /** Passed as the default when the parse context is asked for a TagSoup {@link Schema}. */
+    private static final Schema HTML_SCHEMA = new HTMLSchema();
+
+    /**
+     * Builds the read-only table of supported media types.
+     *
+     * @return unmodifiable map from media type to renderer name
+     */
+    private static Map<MediaType, String> buildRendererTypes() {
+        Map<MediaType, String> types = new HashMap<MediaType, String>();
+        types.put(MediaType.text("x-c++src"), CPP);
+        types.put(MediaType.text("x-java-source"), JAVA);
+        types.put(MediaType.text("x-groovy"), GROOVY);
+        return Collections.unmodifiableMap(types);
+    }
+
+    /**
+     * Returns the media types this parser has a renderer for.
+     *
+     * @param context parse context (not consulted)
+     * @return unmodifiable set of supported media types
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return TYPES_TO_RENDERER.keySet();
+    }
+
+    /**
+     * Highlights the source read from {@code stream} and sends the resulting
+     * markup to {@code handler}.
+     * <p>
+     * Nothing is emitted, and no metadata is set by this method, unless the
+     * metadata carries both a content type and a resource name. The given
+     * stream is wrapped in a {@link CloseShieldInputStream} before being
+     * handed to the reader.
+     *
+     * @param stream   source code to parse
+     * @param handler  receives the SAX events of the highlighted markup
+     * @param metadata read for content type and resource name; updated with
+     *                 content type, content encoding, creators and {@code LoC}
+     * @param context  consulted for a {@link ServiceLoader} and a {@link Schema}
+     * @throws IOException      if reading the stream fails
+     * @throws SAXException     if the markup cannot be delivered to the handler
+     * @throws TikaException    if the content type in the metadata cannot be parsed
+     * @throws RuntimeException if no renderer is registered for the content type
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+
+        try (AutoDetectReader reader = new AutoDetectReader(
+                new CloseShieldInputStream(stream), metadata,
+                context.get(ServiceLoader.class, LOADER))) {
+            Charset charset = reader.getCharset();
+            String mediaType = metadata.get(Metadata.CONTENT_TYPE);
+            String name = metadata.get(Metadata.RESOURCE_NAME_KEY);
+            if (mediaType == null || name == null) {
+                // Both values are required for rendering; without them nothing is emitted.
+                return;
+            }
+
+            MediaType type = MediaType.parse(mediaType);
+            if (type == null) {
+                // Report a malformed content type explicitly instead of failing with an NPE below.
+                throw new TikaException("Invalid content type: " + mediaType);
+            }
+            metadata.set(Metadata.CONTENT_TYPE, type.toString());
+            metadata.set(Metadata.CONTENT_ENCODING, charset.name());
+
+            // Looked up once rather than on every line.
+            String lineSeparator = System.lineSeparator();
+            StringBuilder out = new StringBuilder();
+            String line;
+            int nbLines = 0;
+            while ((line = reader.readLine()) != null) {
+                out.append(line).append(lineSeparator);
+                String author = parserAuthor(line);
+                if (author != null) {
+                    metadata.add(TikaCoreProperties.CREATOR, author);
+                }
+                nbLines++;
+            }
+            metadata.set("LoC", String.valueOf(nbLines));
+            Renderer renderer = getRenderer(type.toString());
+
+            String codeAsHtml = renderer.highlight(name, out.toString(), charset.name(), false);
+
+            Schema schema = context.get(Schema.class, HTML_SCHEMA);
+
+            // Parse the generated HTML and forward its SAX events to the caller's handler.
+            org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();
+            parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
+            parser.setContentHandler(handler);
+            parser.parse(new InputSource(new StringReader(codeAsHtml)));
+        }
+
+    }
+
+    /**
+     * Finds the renderer registered for the given content type. The lookup
+     * uses the base type, so parameters such as {@code charset} do not prevent
+     * a match.
+     *
+     * @param mimeType content type string, optionally with parameters
+     * @return renderer for the content type
+     * @throws RuntimeException if the string cannot be parsed or no renderer
+     *                          is registered for its base type
+     */
+    private Renderer getRenderer(String mimeType) {
+        MediaType mt = MediaType.parse(mimeType);
+        String type = (mt == null) ? null : TYPES_TO_RENDERER.get(mt.getBaseType());
+        if (type == null) {
+            throw new RuntimeException("unparseable content type " + mimeType);
+        }
+        return XhtmlRendererFactory.getRenderer(type);
+    }
+
+
+    /**
+     * Extracts the author named by an {@code @author} tag on the given line.
+     *
+     * @param line one line of source code
+     * @return trimmed text after the tag, or {@code null} if the line has no tag
+     */
+    private String parserAuthor(String line) {
+        Matcher m = authorPattern.matcher(line);
+        if (m.find()) {
+            return m.group(1).trim();
+        }
+
+        return null;
+    }
+}