You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2018/08/14 17:39:03 UTC
svn commit: r1838040 -
/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
Author: kwright
Date: Tue Aug 14 17:39:03 2018
New Revision: 1838040
URL: http://svn.apache.org/viewvc?rev=1838040&view=rev
Log:
Fix more formatting and logging statement issues
Modified:
manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
Modified: manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java?rev=1838040&r1=1838039&r2=1838040&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java (original)
+++ manifoldcf/trunk/connectors/html-extractor/connector/src/main/java/org/apache/manifoldcf/agents/transformation/htmlextractor/JsoupProcessing.java Tue Aug 14 17:39:03 2018
@@ -27,7 +27,7 @@ import java.util.ArrayList;
import java.util.Hashtable;
import java.util.List;
-import org.apache.manifoldcf.core.system.Logging;
+import org.apache.manifoldcf.crawler.system.Logging;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
@@ -38,144 +38,142 @@ public class JsoupProcessing {
- public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
- Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
- Hashtable<String,String> metadata = new Hashtable<String,String>();
- for(Element meta : doc.select("meta")) {
- Logging.root.warn("Name: " + meta.attr("name") + " - Content: " + meta.attr("content"));
- metadata.put(meta.attr("name"), meta.attr("content"));
- }
+ public static Hashtable<String,String> extractTextAndMetadataHtmlDocument(InputStream streamDoc,String whitelist,List<String> blacklist, boolean stripHtml) throws IOException{
+ Document doc = Jsoup.parse(streamDoc, "UTF-8", "");
+ Hashtable<String,String> metadata = new Hashtable<String,String>();
+ for(Element meta : doc.select("meta")) {
+ Logging.connectors.debug("Name: " + meta.attr("name") + " - Content: " + meta.attr("content"));
+ metadata.put(meta.attr("name"), meta.attr("content"));
+ }
- if (doc.select("title") != null){
- String title = doc.select("title").text();
- metadata.put("title", title);
- }
+ if (doc.select("title") != null){
+ String title = doc.select("title").text();
+ metadata.put("title", title);
+ }
- Element element_keywords = doc.select("meta[name='keywords']").first();
- Logging.root.warn("keywordsjsoupnounet");
- if (element_keywords != null) {
- String keywords = (element_keywords.attr("content"));
- Logging.root.warn("keyyyyyywords"+keywords);
- metadata.put("keywords",keywords);
- }
+ Element element_keywords = doc.select("meta[name='keywords']").first();
+ if (element_keywords != null) {
+ String keywords = (element_keywords.attr("content"));
+ metadata.put("keywords",keywords);
+ }
- Element element_description = doc.select("meta[name=\"description\"]").first();
- if (element_description != null) {
- String description = (element_description.attr("content"));
- metadata.put("description",description);
- }
+ Element element_description = doc.select("meta[name=\"description\"]").first();
+ if (element_description != null) {
+ String description = (element_description.attr("content"));
+ metadata.put("description",description);
+ }
- Element element_author = doc.select("meta[name=\"author\"]").first();
- if (element_author != null) {
- String author = (element_author.attr("content"));
- metadata.put("author",author);
- }
+ Element element_author = doc.select("meta[name=\"author\"]").first();
+ if (element_author != null) {
+ String author = (element_author.attr("content"));
+ metadata.put("author",author);
+ }
- Element element_dcterms_subject = doc.select("meta[name=\"dcterms.subject\"]").first();
- if (element_dcterms_subject != null) {
- String dc_terms_subject = (element_dcterms_subject.attr("content"));
- metadata.put("dc_terms_subject",dc_terms_subject);
- }
+ Element element_dcterms_subject = doc.select("meta[name=\"dcterms.subject\"]").first();
+ if (element_dcterms_subject != null) {
+ String dc_terms_subject = (element_dcterms_subject.attr("content"));
+ metadata.put("dc_terms_subject",dc_terms_subject);
+ }
- Element element_dcterms_title = doc.select("meta[name=\"dcterms.title\"]").first();
- if (element_dcterms_title != null) {
- String dc_terms_title = (element_dcterms_title.attr("content"));
- metadata.put("dc_terms_title",dc_terms_title);
+ Element element_dcterms_title = doc.select("meta[name=\"dcterms.title\"]").first();
+ if (element_dcterms_title != null) {
+ String dc_terms_title = (element_dcterms_title.attr("content"));
+ metadata.put("dc_terms_title",dc_terms_title);
- }
+ }
- Element element_dcterms_creator = doc.select("meta[name=\"dcterms.creator\"]").first();
- if (element_dcterms_creator != null) {
- String dc_terms_creator = (element_dcterms_creator.attr("content"));
- metadata.put("dc_terms_creator",dc_terms_creator);
+ Element element_dcterms_creator = doc.select("meta[name=\"dcterms.creator\"]").first();
+ if (element_dcterms_creator != null) {
+ String dc_terms_creator = (element_dcterms_creator.attr("content"));
+ metadata.put("dc_terms_creator",dc_terms_creator);
- }
+ }
- Element element_dcterms_description = doc.select("meta[name=\"dcterms.description\"]").first();
- if (element_dcterms_description != null) {
- String dc_terms_description = (element_dcterms_description.attr("content"));
- metadata.put("dc_terms_description",dc_terms_description);
+ Element element_dcterms_description = doc.select("meta[name=\"dcterms.description\"]").first();
+ if (element_dcterms_description != null) {
+ String dc_terms_description = (element_dcterms_description.attr("content"));
+ metadata.put("dc_terms_description",dc_terms_description);
- }
+ }
- Element element_dcterms_publisher = doc.select("meta[name=\"dcterms.publisher\"]").first();
- if (element_dcterms_publisher != null) {
- String dc_terms_publisher = (element_dcterms_publisher.attr("content"));
- metadata.put("dc_terms_publisher",dc_terms_publisher);
+ Element element_dcterms_publisher = doc.select("meta[name=\"dcterms.publisher\"]").first();
+ if (element_dcterms_publisher != null) {
+ String dc_terms_publisher = (element_dcterms_publisher.attr("content"));
+ metadata.put("dc_terms_publisher",dc_terms_publisher);
- }
+ }
- Element element_dcterms_contributor = doc.select("meta[name=\"dcterms.contributor\"]").first();
- if (element_dcterms_contributor != null) {
- String dc_terms_contributor = (element_dcterms_contributor.attr("content"));
- metadata.put("dc_terms_contributor",dc_terms_contributor);
+ Element element_dcterms_contributor = doc.select("meta[name=\"dcterms.contributor\"]").first();
+ if (element_dcterms_contributor != null) {
+ String dc_terms_contributor = (element_dcterms_contributor.attr("content"));
+ metadata.put("dc_terms_contributor",dc_terms_contributor);
- }
+ }
- Element element_dcterms_date = doc.select("meta[name=\"dcterms.date\"]").first();
- if (element_dcterms_date != null) {
- String dc_terms_date = (element_dcterms_date.attr("content"));
- metadata.put("dc_terms_date",dc_terms_date);
+ Element element_dcterms_date = doc.select("meta[name=\"dcterms.date\"]").first();
+ if (element_dcterms_date != null) {
+ String dc_terms_date = (element_dcterms_date.attr("content"));
+ metadata.put("dc_terms_date",dc_terms_date);
- }
+ }
- Element element_dcterms_type = doc.select("meta[name=\"dcterms.type\"]").first();
- if (element_dcterms_type != null) {
- String dc_terms_type = (element_dcterms_type.attr("content"));
- metadata.put("dc_terms_type",dc_terms_type);
+ Element element_dcterms_type = doc.select("meta[name=\"dcterms.type\"]").first();
+ if (element_dcterms_type != null) {
+ String dc_terms_type = (element_dcterms_type.attr("content"));
+ metadata.put("dc_terms_type",dc_terms_type);
- }
+ }
- Element element_dcterms_format = doc.select("meta[name=\"dcterms.format\"]").first();
- if (element_dcterms_format != null) {
- String dc_terms_format = (element_dcterms_format.attr("content"));
- metadata.put("dc_terms_format",dc_terms_format);
+ Element element_dcterms_format = doc.select("meta[name=\"dcterms.format\"]").first();
+ if (element_dcterms_format != null) {
+ String dc_terms_format = (element_dcterms_format.attr("content"));
+ metadata.put("dc_terms_format",dc_terms_format);
- }
+ }
- Element element_dcterms_language = doc.select("meta[name=\"dcterms.language\"]").first();
- if (element_dcterms_language != null) {
- String dc_terms_language = (element_dcterms_language.attr("content"));
- metadata.put("dc_terms_language",dc_terms_language);
+ Element element_dcterms_language = doc.select("meta[name=\"dcterms.language\"]").first();
+ if (element_dcterms_language != null) {
+ String dc_terms_language = (element_dcterms_language.attr("content"));
+ metadata.put("dc_terms_language",dc_terms_language);
- }
+ }
- Element element_dcterms_identifier = doc.select("meta[name=\"dcterms.identifier\"]").first();
- if (element_dcterms_identifier != null) {
- String dc_terms_identifier = (element_dcterms_identifier.attr("content"));
- metadata.put("dc_terms_identifier",dc_terms_identifier);
- }
+ Element element_dcterms_identifier = doc.select("meta[name=\"dcterms.identifier\"]").first();
+ if (element_dcterms_identifier != null) {
+ String dc_terms_identifier = (element_dcterms_identifier.attr("content"));
+ metadata.put("dc_terms_identifier",dc_terms_identifier);
+ }
- Element docToKeep = doc.body();
- String finalDoc ;
+ Element docToKeep = doc.body();
+ String finalDoc ;
- // Englobing Tag
- if (whitelist!="body"){
- docToKeep = doc.select(whitelist).first();
- }
+ // Englobing Tag
+ if (whitelist!="body"){
+ docToKeep = doc.select(whitelist).first();
+ }
- // Blacklist
- if (blacklist != null){
- for (int i=0; i< blacklist.size();i++){
- docToKeep.select(blacklist.get(i)).remove();
- }
- }
+ // Blacklist
+ if (blacklist != null){
+ for (int i=0; i< blacklist.size();i++){
+ docToKeep.select(blacklist.get(i)).remove();
+ }
+ }
- if (stripHtml)
- finalDoc = docToKeep.text();
- else
- finalDoc = docToKeep.html();
-
-
- metadata.put("extractedDoc",finalDoc);
+ if (stripHtml)
+ finalDoc = docToKeep.text();
+ else
+ finalDoc = docToKeep.html();
+
+
+ metadata.put("extractedDoc",finalDoc);
- return metadata;
- }
+ return metadata;
+ }
}
\ No newline at end of file