You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ja...@apache.org on 2012/01/25 15:50:02 UTC
svn commit: r1235777 - in /lucene/dev/branches/branch_3x: ./
dev-tools/eclipse/ dev-tools/maven/ lucene/
lucene/contrib/analyzers/kuromoji/ solr/
solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/
solr/contrib/extraction...
Author: janhoy
Date: Wed Jan 25 14:50:01 2012
New Revision: 1235777
URL: http://svn.apache.org/viewvc?rev=1235777&view=rev
Log:
SOLR-2901: Upgrade Solr to Tika 1.0 (backport from trunk with svn merge)
Added:
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/commons-compress-1.3.jar
- copied unchanged from r1235753, lucene/dev/trunk/solr/contrib/extraction/lib/commons-compress-1.3.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-core-1.0.jar
- copied unchanged from r1235753, lucene/dev/trunk/solr/contrib/extraction/lib/tika-core-1.0.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-parsers-1.0.jar
- copied unchanged from r1235753, lucene/dev/trunk/solr/contrib/extraction/lib/tika-parsers-1.0.jar
Removed:
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/commons-compress-1.2.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-core-0.10.jar
lucene/dev/branches/branch_3x/solr/contrib/extraction/lib/tika-parsers-0.10.jar
Modified:
lucene/dev/branches/branch_3x/ (props changed)
lucene/dev/branches/branch_3x/dev-tools/eclipse/dot.classpath
lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template
lucene/dev/branches/branch_3x/lucene/ (props changed)
lucene/dev/branches/branch_3x/lucene/contrib/analyzers/kuromoji/ (props changed)
lucene/dev/branches/branch_3x/solr/ (props changed)
lucene/dev/branches/branch_3x/solr/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java
lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
lucene/dev/branches/branch_3x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
lucene/dev/branches/branch_3x/solr/core/ (props changed)
Modified: lucene/dev/branches/branch_3x/dev-tools/eclipse/dot.classpath
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/dev-tools/eclipse/dot.classpath?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/dev-tools/eclipse/dot.classpath (original)
+++ lucene/dev/branches/branch_3x/dev-tools/eclipse/dot.classpath Wed Jan 25 14:50:01 2012
@@ -132,7 +132,7 @@
<classpathentry kind="lib" path="solr/contrib/extraction/lib/bcmail-jdk15-1.45.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/bcprov-jdk15-1.45.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/boilerpipe-1.1.0.jar"/>
- <classpathentry kind="lib" path="solr/contrib/extraction/lib/commons-compress-1.2.jar"/>
+ <classpathentry kind="lib" path="solr/contrib/extraction/lib/commons-compress-1.3.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/dom4j-1.6.1.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/fontbox-1.6.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/jempbox-1.6.0.jar"/>
@@ -145,8 +145,8 @@
<classpathentry kind="lib" path="solr/contrib/extraction/lib/poi-scratchpad-3.8-beta4.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/rome-0.9.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/tagsoup-1.2.1.jar"/>
- <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-0.10.jar"/>
- <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-0.10.jar"/>
+ <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-core-1.0.jar"/>
+ <classpathentry kind="lib" path="solr/contrib/extraction/lib/tika-parsers-1.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/extraction/lib/xmlbeans-2.3.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/langid/lib/jsonic-1.2.0.jar"/>
<classpathentry kind="lib" path="solr/contrib/langid/lib/langdetect-r111-java5.jar"/>
Modified: lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template (original)
+++ lucene/dev/branches/branch_3x/dev-tools/maven/pom.xml.template Wed Jan 25 14:50:01 2012
@@ -44,7 +44,7 @@
<jetty.version>6.1.26</jetty.version>
<patched.jetty.version>6.1.26-patched-JETTY-1340</patched.jetty.version>
<slf4j.version>1.6.1</slf4j.version>
- <tika.version>0.10</tika.version>
+ <tika.version>1.0</tika.version>
</properties>
<issueManagement>
<system>JIRA</system>
Modified: lucene/dev/branches/branch_3x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/CHANGES.txt?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/CHANGES.txt Wed Jan 25 14:50:01 2012
@@ -189,6 +189,8 @@ Other Changes
* SOLR-2718: Add ability to lazy load response writers, defined with startup="lazy".
(ehatcher)
+* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy)
+
Build
----------------------
* SOLR-2487: Add build target to package war without slf4j jars (janhoy)
Modified: lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/MailEntityProcessor.java Wed Jan 25 14:50:01 2012
@@ -18,8 +18,8 @@ package org.apache.solr.handler.dataimpo
import com.sun.mail.imap.IMAPMessage;
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.utils.ParseUtils;
+import org.apache.tika.Tika;
+import org.apache.tika.metadata.Metadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -94,6 +94,8 @@ public class MailEntityProcessor extends
rTimeout = getIntFromContext("readTimeout", 60 * 1000);
processAttachment = getBoolFromContext("processAttachement", true);
+ tika = new Tika();
+
logConfig();
}
@@ -165,7 +167,10 @@ public class MailEntityProcessor extends
if (!processAttachment || (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT))) return;
InputStream is = part.getInputStream();
String fileName = part.getFileName();
- String content = ParseUtils.getStringContent(is, TikaConfig.getDefaultConfig(), ctype.getBaseType().toLowerCase(Locale.ENGLISH));
+ Metadata md = new Metadata();
+ md.set(Metadata.CONTENT_TYPE, ctype.getBaseType().toLowerCase(Locale.ENGLISH));
+ md.set(Metadata.RESOURCE_NAME_KEY, fileName);
+ String content = tika.parseToString(is, md);
if (disp != null && disp.equalsIgnoreCase(Part.ATTACHMENT)) {
if (row.get(ATTACHMENT) == null)
row.put(ATTACHMENT, new ArrayList<String>());
@@ -528,6 +533,8 @@ public class MailEntityProcessor extends
private boolean processAttachment = true;
+ private Tika tika;
+
// holds the current state
private Store mailbox;
private boolean connected = false;
Modified: lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler-extras/src/java/org/apache/solr/handler/dataimport/TikaEntityProcessor.java Wed Jan 25 14:50:01 2012
@@ -118,9 +118,7 @@ public class TikaEntityProcessor extends
}
Parser tikaParser = null;
if(parser.equals(AUTO_PARSER)){
- AutoDetectParser parser = new AutoDetectParser();
- parser.setConfig(tikaConfig);
- tikaParser = parser;
+ tikaParser = new AutoDetectParser(tikaConfig);
} else {
tikaParser = (Parser) context.getSolrCore().getResourceLoader().newInstance(parser);
}
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/CHANGES.txt Wed Jan 25 14:50:01 2012
@@ -20,7 +20,7 @@ to your Solr Home lib directory. See ht
Tika Dependency
---------------
-Current Version: Tika 0.10 (released 2011-09-30)
+Current Version: Tika 1.0 (released 2011-11-07)
$Id$
@@ -30,6 +30,8 @@ $Id$
This is convenient when Tika's auto detector cannot detect encoding, especially
the text file is too short to detect encoding. (koji)
+* SOLR-2901: Upgrade Solr to Tika 1.0 (janhoy)
+
================== Release 3.5.0 ==================
* SOLR-2372: Upgrade Solr to Tika 0.10 (janhoy)
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/java/org/apache/solr/handler/extraction/ExtractingDocumentLoader.java Wed Jan 25 14:50:01 2012
@@ -39,6 +39,7 @@ import org.apache.tika.exception.TikaExc
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.DefaultParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
@@ -156,7 +157,7 @@ public class ExtractingDocumentLoader ex
if (streamType != null) {
//Cache? Parsers are lightweight to construct and thread-safe, so I'm told
MediaType mt = MediaType.parse(streamType.trim().toLowerCase());
- parser = config.getParser(mt);
+ parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
} else {
parser = autoDetectParser;
}
@@ -173,6 +174,10 @@ public class ExtractingDocumentLoader ex
if (resourceName != null) {
metadata.add(Metadata.RESOURCE_NAME_KEY, resourceName);
}
+ // Provide stream's content type as hint for auto detection
+ if(stream.getContentType() != null) {
+ metadata.add(Metadata.CONTENT_TYPE, stream.getContentType());
+ }
SolrContentHandler handler = factory.createSolrContentHandler(metadata, params, schema);
InputStream inputStream = null;
Modified: lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/extraction/src/test/org/apache/solr/handler/extraction/ExtractingRequestHandlerTest.java Wed Jan 25 14:50:01 2012
@@ -18,7 +18,6 @@ package org.apache.solr.handler.extracti
import java.util.ArrayList;
import java.util.List;
-
import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.ContentStream;
@@ -419,7 +418,33 @@ public class ExtractingRequestHandlerTes
assertU(commit());
assertQ(req("*:*"), "//result[@numFound=1]");
}
+
+ @Test
+ public void testWrongStreamType() throws Exception {
+ ExtractingRequestHandler handler = (ExtractingRequestHandler) h.getCore().getRequestHandler("/update/extract");
+ assertTrue("handler is null and it shouldn't be", handler != null);
+
+ try{
+ // Load plain text specifying another mime type, should fail
+ loadLocal("extraction/version_control.txt",
+ "literal.id", "one",
+ ExtractingParams.STREAM_TYPE, "application/pdf"
+ );
+ fail("SolrException is expected because wrong parser specified for the file type");
+ }
+ catch(Exception expected){}
+ try{
+ // Load plain text specifying non existing mimetype, should fail
+ loadLocal("extraction/version_control.txt",
+ "literal.id", "one",
+ ExtractingParams.STREAM_TYPE, "foo/bar"
+ );
+ fail("SolrException is expected because nonexsisting parser specified");
+ }
+ catch(Exception expected){}
+ }
+
SolrQueryResponse loadLocal(String filename, String... args) throws Exception {
LocalSolrQueryRequest req = (LocalSolrQueryRequest) req(args);
try {
Modified: lucene/dev/branches/branch_3x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java?rev=1235777&r1=1235776&r2=1235777&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java (original)
+++ lucene/dev/branches/branch_3x/solr/contrib/langid/src/test/org/apache/solr/update/processor/LanguageIdentifierUpdateProcessorFactoryTestCase.java Wed Jan 25 14:50:01 2012
@@ -67,7 +67,7 @@ public abstract class LanguageIdentifier
assertLang("no", "id", "1no", "name", "Lucene", "subject", "Lucene er et fri/åpen kildekode programvarebibliotek for informasjonsgjenfinning, opprinnelig utviklet i programmeringsspråket Java av Doug Cutting. Lucene støttes av Apache Software Foundation og utgis under Apache-lisensen.");
assertLang("en", "id", "2en", "name", "Lucene", "subject", "Apache Lucene is a free/open source information retrieval software library, originally created in Java by Doug Cutting. It is supported by the Apache Software Foundation and is released under the Apache Software License.");
assertLang("sv", "id", "3sv", "name", "Maven", "subject", "Apache Maven är ett verktyg utvecklat av Apache Software Foundation och används inom systemutveckling av datorprogram i programspråket Java. Maven används för att automatiskt paketera (bygga) programfilerna till en distribuerbar enhet. Maven används inom samma område som Apache Ant men dess byggfiler är deklarativa till skillnad ifrån Ants skriptbaserade.");
- assertLang("es", "id", "4es", "name", "Lucene", "subject", "Lucene es un API de código abierto para recuperación de información, originalmente implementada en Java por Doug Cutting. Está apoyado por el Apache Software Foundation y se distribuye bajo la Apache Software License. Lucene tiene versiones para otros lenguajes incluyendo Delphi, Perl, C#, C++, Python, Ruby y PHP.");
+ assertLang("es", "id", "4es", "name", "Español", "subject", "El español, como las otras lenguas romances, es una continuación moderna del latín hablado (denominado latín vulgar), desde el siglo III, que tras el desmembramiento del Imperio romano fue divergiendo de las otras variantes del latín que se hablaban en las distintas provincias del antiguo Imperio, dando lugar mediante una lenta evolución a las distintas lenguas romances. Debido a su propagación por América, el español es, con diferencia, la lengua romance que ha logrado mayor difusión.");
assertLang("un", "id", "5un", "name", "a", "subject", "b");
assertLang("th", "id", "6th", "name", "à¸à¸à¸à¸§à¸²à¸¡à¸à¸±à¸à¸ªà¸£à¸£à¹à¸à¸·à¸à¸à¸à¸µà¹", "subject", "à¸à¸±à¸à¹à¸à¸à¸¥à¸µà¸ª มารี à¸à¸±à¸à¹à¸à¸ à¸à¸£à¸±à¸à¸à¹ หรืà¸à¸¡à¸±à¸à¸£à¸¹à¹à¸à¸±à¸à¹à¸à¸ าษาà¹à¸à¸¢à¸§à¹à¸² à¹à¸à¸à¸à¹ à¹à¸à¸£à¸à¸à¹ à¹à¸à¹à¸à¹à¸à¹à¸à¸«à¸à¸´à¸à¸à¸²à¸§à¸¢à¸´à¸§ à¹à¸à¸´à¸à¸à¸µà¹à¹à¸¡à¸·à¸à¸à¹à¸à¸£à¸à¸à¹à¹à¸à¸´à¸£à¹à¸ à¸à¸£à¸°à¹à¸à¸¨à¹à¸¢à¸à¸£à¸¡à¸à¸µ à¹à¸à¸à¸¡à¸µà¸à¸·à¹à¸à¹à¸ªà¸µà¸¢à¸à¹à¸à¹à�
�à¸à¸±à¸à¹à¸à¸à¸²à¸à¸°à¸à¸¹à¹à¹à¸à¸µà¸¢à¸à¸à¸±à¸à¸à¸¶à¸à¸à¸£à¸°à¸à¸³à¸§à¸±à¸à¸à¸¶à¹à¸à¸à¹à¸à¸¡à¸²à¹à¸à¹à¸£à¸±à¸à¸à¸²à¸£à¸à¸µà¸à¸´à¸¡à¸à¹à¹à¸à¹à¸à¸«à¸à¸±à¸à¸ªà¸·à¸ à¸à¸£à¸£à¸¢à¸²à¸¢à¹à¸«à¸à¸¸à¸à¸²à¸£à¸à¹à¸à¸à¸°à¸«à¸¥à¸à¸à¹à¸à¸à¸à¸±à¸§à¸à¸²à¸à¸à¸²à¸£à¸¥à¹à¸²à¸à¸²à¸§à¸¢à¸´à¸§à¹à¸à¸à¸£à¸°à¹à¸à¸¨à¹à¸à¹à¸à¸à¸£à¹à¹à¸¥à¸à¸à¹ ระหวà¹à¸²à¸à¸à¸µà¹à¸à¸¹à¸à¹à¸¢à¸à¸£à¸¡à¸à¸µà¹à¸à¹à¸²à¸à¸£à¸à¸à¸à¸£à¸à¸à¹à¸
à¸à¹à¸§à¸à¸ªà¸à¸à¸£à¸²à¸¡à¹à¸¥à¸à¸à¸£à¸±à¹à¸à¸à¸µà¹à¸ªà¸à¸");
assertLang("ru", "id", "7ru", "name", "Lucene", "subject", "The Apache Lucene — это свободная библиотека для высокоскоростного полнотекстового поиска, написанная на Java. Может быть использована для поиска в интернете и других областях компьютерной лингвистики (аналитическая философия).");
@@ -76,7 +76,17 @@ public abstract class LanguageIdentifier
assertLang("nl", "id", "10nl", "name", "Lucene", "subject", "Lucene is een gratis open source, tekst gebaseerde information retrieval API van origine geschreven in Java door Doug Cutting. Het wordt ondersteund door de Apache Software Foundation en is vrijgegeven onder de Apache Software Licentie. Lucene is ook beschikbaar in andere programeertalen zoals Perl, C#, C++, Python, Ruby en PHP.");
assertLang("it", "id", "11it", "name", "Lucene", "subject", "Lucene è una API gratuita ed open source per il reperimento di informazioni inizialmente implementata in Java da Doug Cutting. È supportata dall'Apache Software Foundation ed è resa disponibile con l'Apache License. Lucene è stata successivamente reimplementata in Perl, C#, C++, Python, Ruby e PHP.");
assertLang("pt", "id", "12pt", "name", "Lucene", "subject", "Apache Lucene, ou simplesmente Lucene, é um software de busca e uma API de indexação de documentos, escrito na linguagem de programação Java. É um software de código aberto da Apache Software Foundation licenciado através da licença Apache.");
+ // New in Tika1.0
+ assertLang("ca", "id", "13ca", "name", "Catalan", "subject", "El català posseeix dos estàndards principals: el regulat per l'Institut d'Estudis Catalans, o estàndard general, que pren com a base l'ortografia establerta per Pompeu Fabra amb els trets gramaticals i ortogràfics característics del català central; i el regulat per l'Acadèmia Valenciana de la Llengua, estàndard d'àmbit restringit, centrat en l'estandardització del valencià i que pren com a base les Normes de Castelló, és a dir, l'ortografia de Pompeu Fabra però més adaptada a la pronúncia del català occidental i als trets que caracteritzen els dialectes valencians.");
+ assertLang("be", "id", "14be", "name", "Belarusian", "subject", "Наступнай буйной дзяржавай на беларускай зямлі было Вялікае княства Літоўскае, Рускае і Жамойцкае (ВКЛ). Падчас стварэння і пачатковага развіцця гэтай дзяржавы найбуйнейшым і асноўным яе цэнтрам быў Новагародак. Акрамя сучасных земляў Беларусі, у склад гэтай дзяржавы ўваходзілі таксама землі сучаснай Літвы, паўночная частка сучаснай Украіны і частка сучаснай Расіі.");
+ assertLang("eo", "id", "15eo", "name", "Esperanto", "subject", "La vortprovizo de Esperanto devenas plejparte el la okcidenteŭropaj lingvoj, dum ĝia sintakso kaj morfologio montras ankaŭ slavlingvan influon. La morfemoj ne ŝanĝiĝas kaj oni povas ilin preskaŭ senlime kombini, kreante diverssignifajn vortojn, Esperanto do havas multajn kunaĵojn kun la analizaj lingvoj, al kiuj apartenas ekzemple la ĉina; kontraŭe la interna strukturo de Esperanto certagrade respegulas la aglutinajn lingvojn, kiel la japanan, svahilan aŭ turkan.");
+ assertLang("gl", "id", "16gl", "name", "Galician", "subject", "A cifra de falantes medrou axiña durante as décadas seguintes, nun principio no Imperio ruso e na Europa oriental, logo na Europa occidental, América, China e no Xapón. Nos primeiros anos do movemento, os esperantistas mantiñan contacto por correspondencia, pero en 1905 o primeiro Congreso Universal de Esperanto levouse a cabo na cidade francesa de Boulogne-sur-Mer. Dende entón, os congresos mundiais organizáronse nos cinco continentes ano tras ano agás durante as dúas Guerras Mundiais.");
+ assertLang("ro", "id", "17ro", "name", "Romanian", "subject", "La momentul destrămării Uniunii Sovietice și a înlăturării regimului comunist instalat în România (1989), țara a inițiat o serie de reforme economice și politice. După un deceniu de probleme economice, România a introdus noi reforme economice de ordin general (precum cota unică de impozitare, în 2005) și a aderat la Uniunea Europeană la 1 ianuarie 2007.");
+ assertLang("sk", "id", "18sk", "name", "Slovakian", "subject", "Boli vytvorené dva národné parlamenty - Česká národná rada a Slovenská národná rada a spoločný jednokomorový česko-slovenský parlament bol premenovaný z Národného zhromaždenia na Federálne zhromaždenie s dvoma komorami - Snemovňou ľudu a Snemovňu národov.");
+ assertLang("sl", "id", "19sl", "name", "Slovenian", "subject", "Slovenska Wikipedija je različica spletne enciklopedije Wikipedije v slovenskem jeziku. Projekt slovenske Wikipedije se je začel 26. februarja 2002 z ustanovitvijo njene spletne strani, njen pobudnik pa je bil uporabnik Jani Melik.");
+ assertLang("uk", "id", "20uk", "name", "Ukrainian", "subject", "Народно-господарський комплекс країни включає такі види промисловості як важке машинобудування, чорна та кольорова металургія, суднобудування, виробництво автобусів, легкових та вантажних автомобілів, тракторів та іншої сільськогосподарської техніки, тепловозів, верстатів, турбін, авіаційних двигунів та літаків, обладнання для електростанцій, нафто-газової та хімічної промисловості тощо. Крім того, Україна є потужним виробником електроенергії. Україна має розвинуте сільське господарство і займає одне з провідних місць серед експортерів деяких видів сільськогосподарської продукції і продовольства (зокрема, соняшникової олії).");
}
+
@Test
public void testMapFieldName() throws Exception {