You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by gs...@apache.org on 2010/03/10 17:18:09 UTC

svn commit: r921425 - in /lucene/solr/trunk/contrib/extraction: CHANGES.txt lib/icu4j-4_2_1.jar src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java src/test/resources/arabic.pdf

Author: gsingers
Date: Wed Mar 10 16:18:09 2010
New Revision: 921425

URL: http://svn.apache.org/viewvc?rev=921425&view=rev
Log:
SOLR-1813: Added ICU4J to extraction and test for Arabic

Added:
    lucene/solr/trunk/contrib/extraction/lib/icu4j-4_2_1.jar   (with props)
    lucene/solr/trunk/contrib/extraction/src/test/resources/arabic.pdf   (with props)
Modified:
    lucene/solr/trunk/contrib/extraction/CHANGES.txt
    lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java

Modified: lucene/solr/trunk/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/CHANGES.txt?rev=921425&r1=921424&r2=921425&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/extraction/CHANGES.txt (original)
+++ lucene/solr/trunk/contrib/extraction/CHANGES.txt Wed Mar 10 16:18:09 2010
@@ -29,6 +29,7 @@ $Id:$
 
 * SOLR-1738: Upgrade to Tika 0.6 (gsingers)
 
+* SOLR-1813: Add ICU4j to libs and add tests for Arabic extraction (Robert Muir via gsingers)
 
 ================== Release 1.4.0 ==================
 

Added: lucene/solr/trunk/contrib/extraction/lib/icu4j-4_2_1.jar
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/lib/icu4j-4_2_1.jar?rev=921425&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/trunk/contrib/extraction/lib/icu4j-4_2_1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java?rev=921425&r1=921424&r2=921425&view=diff
==============================================================================
--- lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java (original)
+++ lucene/solr/trunk/contrib/extraction/src/test/java/org/apache/solr/handler/ExtractingRequestHandlerTest.java Wed Mar 10 16:18:09 2010
@@ -322,6 +322,22 @@ public class ExtractingRequestHandlerTes
     assertTrue(val + " is not equal to " + "linkNews", val.equals("linkNews") == true);//there are two <a> tags, and they get collapesd
   }
 
+  /** test arabic PDF extraction is functional */
+  public void testArabicPDF() throws Exception {
+    ExtractingRequestHandler handler = (ExtractingRequestHandler) 
+      h.getCore().getRequestHandler("/update/extract");
+    assertTrue("handler is null and it shouldn't be", handler != null);
+
+    loadLocal("arabic.pdf", "fmap.created", "extractedDate", "fmap.producer", "extractedProducer",
+        "fmap.creator", "extractedCreator", "fmap.Keywords", "extractedKeywords",
+        "fmap.Author", "extractedAuthor",
+        "fmap.content", "wdf_nocase",
+       "literal.id", "one",
+        "fmap.Last-Modified", "extractedDate");
+    assertQ(req("wdf_nocase:السلم"), "//result[@numFound=0]");
+    assertU(commit());
+    assertQ(req("wdf_nocase:السلم"), "//result[@numFound=1]");
+  }
 
   SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
     LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);

Added: lucene/solr/trunk/contrib/extraction/src/test/resources/arabic.pdf
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/contrib/extraction/src/test/resources/arabic.pdf?rev=921425&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/solr/trunk/contrib/extraction/src/test/resources/arabic.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/pdf