You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by sh...@apache.org on 2008/12/04 20:50:43 UTC
svn commit: r723410 - in /lucene/solr/trunk/contrib/dataimporthandler:
CHANGES.txt
src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
Author: shalin
Date: Thu Dec 4 11:50:43 2008
New Revision: 723410
URL: http://svn.apache.org/viewvc?rev=723410&view=rev
Log:
SOLR-887 -- A Transformer to strip HTML tags.
Added:
lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java (with props)
Modified:
lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
Modified: lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt?rev=723410&r1=723409&r2=723410&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/dataimporthandler/CHANGES.txt Thu Dec 4 11:50:43 2008
@@ -32,6 +32,9 @@
residing as CLOBs or BLOBs in databases.
(Noble Paul via shalin)
+5. SOLR-887: A Transformer to strip HTML tags.
+ (Ahmed Hammad via shalin)
+
Optimizations
----------------------
Added: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java?rev=723410&view=auto
==============================================================================
--- lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java (added)
+++ lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java Thu Dec 4 11:50:43 2008
@@ -0,0 +1,93 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.handler.dataimport;
+
+import org.apache.solr.analysis.HTMLStripReader;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A Transformer implementation which strips off HTML tags using org.apache.solr.analysis.HTMLStripReader. This is useful
+ * in case you don't need this HTML anyway.
+ *
+ * @version $Id$
+ * @see org.apache.solr.analysis.HTMLStripReader
+ * @since solr 1.4
+ */
+public class HTMLStripTransformer extends Transformer {
+
+ @Override
+ @SuppressWarnings("unchecked")
+ public Object transformRow(Map<String, Object> row, Context context) {
+ List<Map<String, String>> fields = context.getAllEntityFields();
+ for (Map<String, String> field : fields) {
+ String col = field.get(DataImporter.COLUMN);
+ String splitHTML = field.get(STRIP_HTML);
+ if (!TRUE.equals(splitHTML))
+ continue;
+ Object tmpVal = row.get(col);
+ if (tmpVal == null)
+ continue;
+
+ if (tmpVal instanceof List) {
+ List<String> inputs = (List<String>) tmpVal;
+ List results = new ArrayList();
+ for (String input : inputs) {
+ Object o = stripHTML(input, col);
+ if (o != null)
+ results.add(o);
+ }
+ row.put(col, results);
+ } else {
+ String value = tmpVal.toString();
+ Object o = stripHTML(value, col);
+ if (o != null)
+ row.put(col, o);
+ }
+ }
+ return row;
+ }
+
+ private Object stripHTML(String value, String column) {
+ StringBuilder out = new StringBuilder();
+ StringReader strReader = new StringReader(value);
+ try {
+ HTMLStripReader html = new HTMLStripReader(strReader);
+ char[] cbuf = new char[1024 * 10];
+ while (true) {
+ int count = html.read(cbuf);
+ if (count == -1)
+ break; // end of stream mark is -1
+ if (count > 0)
+ out.append(cbuf, 0, count);
+ }
+ html.close();
+ } catch (IOException e) {
+ throw new DataImportHandlerException(DataImportHandlerException.SEVERE,
+ "Failed stripping HTML for column: " + column, e);
+ }
+ return out.toString();
+ }
+
+ public static final String STRIP_HTML = "stripHTML";
+
+ public static final String TRUE = "true";
+}
Propchange: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/solr/trunk/contrib/dataimporthandler/src/main/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL