You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by sh...@apache.org on 2008/12/04 20:50:43 UTC

svn commit: r723410 - in /lucene/solr/trunk/contrib/dataimporthandler: CHANGES.txt src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java

Author: shalin
Date: Thu Dec  4 11:50:43 2008
New Revision: 723410

URL: http://svn.apache.org/viewvc?rev=723410&view=rev
Log:
SOLR-887 -- A Transformer to strip HTML tags.

Added:
    lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java   (with props)
Modified:
    lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt

Modified: lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt?rev=723410&r1=723409&r2=723410&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt Thu Dec  4 11:50:43 2008
@@ -32,6 +32,9 @@
               residing as CLOBs or BLOBs in databases.
               (Noble Paul via shalin)
 
+5. SOLR-887:  A Transformer to strip HTML tags.
+              (Ahmed Hammad via shalin)
+
 Optimizations
 ----------------------
 

Added: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java?rev=723410&view=auto
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java (added)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java Thu Dec  4 11:50:43 2008
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.dataimport;
+
+import org.apache.solr.analysis.HTMLStripReader;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A Transformer implementation which strips off HTML tags using org.apache.solr.analysis.HTMLStripReader. This is
+ * useful in case you don't need this HTML anyway.
+ *
+ * @version $Id$
+ * @see org.apache.solr.analysis.HTMLStripReader
+ * @since solr 1.4
+ */
+public class HTMLStripTransformer extends Transformer {
+
+  /**
+   * Strips HTML markup from every column of the row whose field declaration
+   * carries the attribute <code>stripHTML="true"</code>.
+   * <p/>
+   * Columns that are absent from the row (or mapped to <code>null</code>) are
+   * left untouched. A column holding a {@link List} is replaced by a new list
+   * containing the stripped form of each non-null element; any other value is
+   * converted with <code>toString()</code> and replaced by its stripped form.
+   *
+   * @param row     the row being imported; it is modified in place
+   * @param context supplies the field declarations of the current entity
+   * @return the same <code>row</code> instance that was passed in
+   * @throws DataImportHandlerException (SEVERE) if reading through the HTML
+   *                                    stripper fails
+   */
+  @Override
+  @SuppressWarnings("unchecked")
+  public Object transformRow(Map<String, Object> row, Context context) {
+    List<Map<String, String>> fields = context.getAllEntityFields();
+    for (Map<String, String> field : fields) {
+      String col = field.get(DataImporter.COLUMN);
+      if (!TRUE.equals(field.get(STRIP_HTML))) {
+        continue;
+      }
+      Object tmpVal = row.get(col);
+      if (tmpVal == null) {
+        continue;
+      }
+
+      if (tmpVal instanceof List) {
+        // Unchecked cast: assumes the elements of a multi-valued column are Strings.
+        List<String> inputs = (List<String>) tmpVal;
+        List<String> results = new ArrayList<String>(inputs.size());
+        for (String input : inputs) {
+          // A null element cannot be wrapped in a StringReader (the constructor
+          // throws NullPointerException), so it is skipped instead of aborting the row.
+          if (input == null) {
+            continue;
+          }
+          results.add(stripHTML(input, col));
+        }
+        row.put(col, results);
+      } else {
+        row.put(col, stripHTML(tmpVal.toString(), col));
+      }
+    }
+    return row;
+  }
+
+  /**
+   * Runs <code>value</code> through an {@link HTMLStripReader} and collects
+   * everything the reader emits.
+   *
+   * @param value  the text to strip; must not be <code>null</code>
+   * @param column the column name, used only in the error message
+   * @return the text produced by the stripping reader; never <code>null</code>
+   * @throws DataImportHandlerException (SEVERE) wrapping any IOException raised
+   *                                    while reading or closing the reader
+   */
+  private String stripHTML(String value, String column) {
+    StringBuilder out = new StringBuilder();
+    StringReader strReader = new StringReader(value);
+    try {
+      HTMLStripReader html = new HTMLStripReader(strReader);
+      try {
+        char[] cbuf = new char[1024 * 10];
+        int count;
+        // read() signals end of stream with -1; a zero-length read is simply retried
+        while ((count = html.read(cbuf)) != -1) {
+          if (count > 0) {
+            out.append(cbuf, 0, count);
+          }
+        }
+      } finally {
+        // close even when read() fails, so the reader is never left open
+        html.close();
+      }
+    } catch (IOException e) {
+      throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
+              "Failed stripping HTML for column: " + column, e);
+    }
+    return out.toString();
+  }
+
+  /**
+   * Name of the field attribute that enables HTML stripping for a column.
+   */
+  public static final String STRIP_HTML = "stripHTML";
+
+  /**
+   * Value the {@link #STRIP_HTML} attribute must have for stripping to be applied.
+   */
+  public static final String TRUE = "true";
+}

Propchange: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL