You are viewing a plain text version of this content. The canonical link for it is here.
Posted to oak-commits@jackrabbit.apache.org by ca...@apache.org on 2018/05/25 10:41:28 UTC
svn commit: r1832231 - in /jackrabbit/oak/trunk/oak-run/src:
main/java/org/apache/jackrabbit/oak/plugins/tika/
test/java/org/apache/jackrabbit/oak/plugins/tika/
Author: catholicon
Date: Fri May 25 10:41:28 2018
New Revision: 1832231
URL: http://svn.apache.org/viewvc?rev=1832231&view=rev
Log:
OAK-7353: oak-run tika extraction should support getting assistance from stored indexed data from a lucene index
Added:
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java (with props)
jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java (with props)
Modified:
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java
Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1832231&r1=1832230&r2=1832231&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Fri May 25 10:41:28 2018
@@ -68,10 +68,12 @@ public class TextExtractorMain {
try (Closer closer = Closer.create()) {
boolean report = tikaOpts.report();
boolean extract = tikaOpts.extract();
+ boolean populate = tikaOpts.populate();
boolean generate = tikaOpts.generate();
- BlobStore blobStore;
+ BlobStore blobStore = null;
NodeStore nodeStore = null;
File dataFile = tikaOpts.getDataFile();
+ File indexDir = tikaOpts.getIndexDir();
File storeDir = tikaOpts.getStoreDir();
File tikaConfigFile = tikaOpts.getTikaConfig();
BinaryResourceProvider binaryResourceProvider = null;
@@ -92,19 +94,24 @@ public class TextExtractorMain {
checkNotNull(dataFile, "Data file not configured with %s", tikaOpts.getDataFileSpecOpt());
- if (!generate) {
+ if (report || extract) {
//For report and extract case we do not need NodeStore access so create BlobStore directly
BlobStoreFixture blobStoreFixture = BlobStoreFixtureProvider.create(opts);
closer.register(blobStoreFixture);
blobStore = checkNotNull(blobStoreFixture).getBlobStore();
- } else {
+ } else if (generate) {
NodeStoreFixture nodeStoreFixture = NodeStoreFixtureProvider.create(opts);
closer.register(nodeStoreFixture);
blobStore = nodeStoreFixture.getBlobStore();
nodeStore = nodeStoreFixture.getStore();
}
- checkNotNull(blobStore, "This command requires an external BlobStore configured");
+ if (!populate) {
+ checkNotNull(blobStore, "This command requires an external BlobStore configured");
+ }
+
+ // NOTE: The order of executing generate, populate and extract is deliberate, so that it stays
+ // correct when the user calls the tool with multiple actions in the same run.
if (generate){
checkNotNull(dataFile, "Data file path not provided");
@@ -114,6 +121,20 @@ public class TextExtractorMain {
generator.generate(brp.getBinaries(path));
}
+ if (populate) {
+ checkArgument(dataFile.exists(),
+ "Data file %s does not exist", dataFile.getAbsolutePath());
+ checkNotNull(indexDir, "Lucene index directory " +
+ "must be specified via %s", tikaOpts.getIndexDirSpecOpt());
+ checkNotNull(storeDir, "Directory to store extracted text content " +
+ "must be specified via %s", tikaOpts.getStoreDirSpecOpt());
+
+ DataStoreTextWriter writer = closer.register(new DataStoreTextWriter(storeDir, false));
+
+ TextPopulator textPopulator = new TextPopulator(writer);
+ textPopulator.populate(dataFile, indexDir);
+ }
+
if (report || extract) {
checkArgument(dataFile.exists(),
"Data file %s does not exist", dataFile.getAbsolutePath());
Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java?rev=1832231&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java Fri May 25 10:41:28 2018
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import com.google.common.base.Stopwatch;
+import com.google.common.io.Closer;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.FSDirectory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static org.apache.jackrabbit.JcrConstants.JCR_PATH;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.FULLTEXT;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.PATH;
+import static org.apache.jackrabbit.oak.plugins.tika.CSVFileBinaryResourceProvider.FORMAT;
+
+class TextPopulator {
+ private static final Logger log = LoggerFactory.getLogger(TextPopulator.class);
+
+ static final String BLOB_ID = "blobId";
+ static final String ERROR_TEXT = "TextExtractionError";
+
+ private final TextWriter textWriter;
+
+ private PopulatorStats stats;
+
+ TextPopulator(TextWriter textWriter) {
+ this.textWriter = textWriter;
+ this.stats = new PopulatorStats();
+ }
+
+ // exposed for test purposes only
+ void setStats(PopulatorStats stats) {
+ this.stats = stats;
+ }
+
+ void populate(File dataFile, File indexDir) throws IOException {
+ try (Closer closer = Closer.create()) {
+ Iterable<CSVRecord> csvRecords = closer.register(CSVParser.parse(dataFile, UTF_8, FORMAT));
+
+ final FSDirectory dir = closer.register(FSDirectory.open(indexDir));
+ final DirectoryReader reader = closer.register(DirectoryReader.open(dir));
+ final IndexSearcher searcher = new IndexSearcher(reader);
+
+ for (CSVRecord record : csvRecords) {
+ String blobId = record.get(BLOB_ID);
+ String jcrPath = record.get(JCR_PATH);
+
+ if (!textWriter.isProcessed(blobId)) {
+ String text = getText(reader, searcher, jcrPath);
+
+ stats.processed++;
+
+ if (text == null) {
+ // Ignore errors as we might be processing partial OR incorrect index
+ // writer.markError(blobId);
+ stats.errored++;
+ } else if (ERROR_TEXT.equals(text)) {
+ textWriter.markError(blobId);
+ stats.errored++;
+ } else if (text.length() == 0) {
+ textWriter.markEmpty(blobId);
+ stats.empty++;
+ } else {
+ textWriter.write(blobId, text);
+ stats.parsed++;
+ }
+ } else {
+ stats.ignored++;
+ }
+
+ stats.readAndDumpStatsIfRequired(jcrPath);
+ }
+ log.info(stats.toString());
+ }
+ }
+
+ private static String getText(DirectoryReader reader, IndexSearcher searcher, String path) {
+ TopDocs topDocs;
+ try {
+ topDocs = searcher.search(new TermQuery(new Term(PATH, path)), 1);
+
+ ScoreDoc[] scoreDocs = topDocs.scoreDocs;
+ if (scoreDocs.length != 1) {
+ return null;
+ }
+
+ Document doc = reader.document(scoreDocs[0].doc);
+
+ String[] ftVals = doc.getValues(FULLTEXT);
+ if (ftVals.length != 1) {
+ // being conservative... expecting only one stored fulltext field
+ return null;
+ }
+
+ return ftVals[0].trim();
+ } catch (IOException e) {
+ // ignore
+ }
+
+ return null;
+ }
+
+ static class PopulatorStats {
+ int read = 0;
+ int ignored = 0;
+ int processed = 0;
+ int parsed = 0;
+ int errored = 0;
+ int empty = 0;
+
+ Stopwatch w = Stopwatch.createStarted();
+
+ void readAndDumpStatsIfRequired(String path) {
+ read++;
+
+ if (read%10000 == 0) {
+ log.info("{} - currently at {}", this.toString(), path);
+ }
+ }
+
+ @Override
+ public String toString () {
+ return String.format("Text populator stats - " +
+ "Read: %s; Ignored: %s; Processed: %s; Parsed: %s; Errored: %s; Empty: %s (in %s)",
+ read, ignored, processed, parsed, errored, empty, w);
+ }
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java
------------------------------------------------------------------------------
svn:eol-style = native
Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java?rev=1832231&r1=1832230&r2=1832231&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java Fri May 25 10:41:28 2018
@@ -38,10 +38,12 @@ public class TikaCommandOptions implemen
private final OptionSpec<File> dataFileSpecOpt;
private final OptionSpec<File> tikaConfigSpecOpt;
private final OptionSpec<File> storeDirSpecOpt;
+ private final OptionSpec<File> indexDirSpecOpt;
private final OptionSpec<Integer> poolSizeOpt;
private final OptionSpec<Void> reportAction;
private final OptionSpec<Void> generateAction;
+ private final OptionSpec<Void> populateAction;
private final OptionSpec<Void> extractAction;
private final Set<String> operationNames;
@@ -71,6 +73,11 @@ public class TikaCommandOptions implemen
.withRequiredArg()
.ofType(File.class);
+ indexDirSpecOpt = parser
+ .accepts("index-dir", "Path of directory which stores lucene index containing extracted data")
+ .withRequiredArg()
+ .ofType(File.class);
+
poolSizeOpt = parser
.accepts("pool-size", "Size of the thread pool used to perform text extraction. Defaults " +
"to number of cores on the system")
@@ -79,9 +86,10 @@ public class TikaCommandOptions implemen
reportAction = parser.accepts("report", "Generates a summary report based on the csv file");
generateAction = parser.accepts("generate", "Generates the CSV file required for 'extract' and 'report' actions");
+ populateAction = parser.accepts("populate", "Populates extraction store based on supplied indexed data and csv file");
extractAction = parser.accepts("extract", "Performs the text extraction based on the csv file");
- operationNames = ImmutableSet.of("report", "generate", "extract");
+ operationNames = ImmutableSet.of("report", "generate", "populate", "extract");
}
@Override
@@ -97,7 +105,7 @@ public class TikaCommandOptions implemen
@Override
public String description() {
return "The tika command supports following operations. All operations connect to repository in read only mode. \n" +
- "Use of one of the supported actions like --report, --generate, --extract etc. ";
+ "Use of one of the supported actions like --report, --generate, --populate, --extract etc. ";
}
@Override
@@ -126,6 +134,10 @@ public class TikaCommandOptions implemen
return storeDirSpecOpt.value(options);
}
+ public File getIndexDir() {
+ return indexDirSpecOpt.value(options);
+ }
+
public boolean isPoolSizeDefined() {
return options.has(poolSizeOpt);
}
@@ -143,6 +155,10 @@ public class TikaCommandOptions implemen
return options.has(generateAction) || hasNonOption("generate");
}
+ public boolean populate() {
+ return options.has(populateAction) || hasNonOption("populate");
+ }
+
public boolean extract() {
return options.has(extractAction) || hasNonOption("extract");
}
@@ -151,6 +167,10 @@ public class TikaCommandOptions implemen
return dataFileSpecOpt;
}
+ public OptionSpec<File> getIndexDirSpecOpt() {
+ return indexDirSpecOpt;
+ }
+
public OptionSpec<File> getStoreDirSpecOpt() {
return storeDirSpecOpt;
}
Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java?rev=1832231&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java Fri May 25 10:41:28 2018
@@ -0,0 +1,309 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import com.beust.jcommander.internal.Maps;
+import com.google.common.collect.FluentIterable;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import com.google.common.io.ByteSource;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
+import org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory;
+import org.apache.jackrabbit.oak.plugins.index.lucene.OakAnalyzer;
+import org.apache.jackrabbit.oak.plugins.tika.TextPopulator.PopulatorStats;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import javax.annotation.Nonnull;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TextPopulatorTest {
+ @Rule
+ public TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+ private File indexDir = null;
+ private File csv = null;
+ private FakeTextWriter textWriter = new FakeTextWriter();
+ private PopulatorStats stats = new PopulatorStats();
+ private TextPopulator textPopulator = new TextPopulator(textWriter);
+
+ @Before
+ public void setup() throws Exception {
+ indexDir = temporaryFolder.newFolder("index-dump");
+ csv = temporaryFolder.newFile("blobs.csv");
+
+ textPopulator.setStats(stats);
+
+ setupIndexData();
+ }
+
+ private void setupIndexData() throws Exception {
+ Map<String, String> dataMap = Maps.newHashMap();
+ dataMap.put("/sentence", "some sentence.");
+ dataMap.put("/para", "some sentence.\nAnd more sentence after a new line");
+ dataMap.put("/error", TextPopulator.ERROR_TEXT);
+ dataMap.put("/null", null);
+ dataMap.put("/empty", "");
+ dataMap.put("/untrimmed-empty", " ");
+ dataMap.put("/untrimmed", " untrimmed ");
+
+ FSDirectory directory = FSDirectory.open(indexDir);
+ IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new OakAnalyzer(Version.LUCENE_47));
+ try (IndexWriter writer = new IndexWriter(directory, config)) {
+ for (Map.Entry<String, String> data : dataMap.entrySet()) {
+ writer.addDocument(createLuceneDocument(data.getKey(), data.getValue()));
+ }
+
+ // add document with multiple :fulltext
+ writer.addDocument(createLuceneDocument("/multi", "value1", "value2"));
+ }
+ }
+
+ private void setupCSV(String ... paths) throws IOException {
+ BinaryResourceProvider brp = new FakeBinaryResourceProvider(paths);
+ CSVFileGenerator generator = new CSVFileGenerator(csv);
+ generator.generate(brp.getBinaries("/"));
+ }
+
+ private List<Field> createLuceneDocument(@Nonnull String path, String ... values) {
+ List<Field> fields = Lists.newArrayList();
+ for (String value : values) {
+ if (value != null) {
+ fields.add(FieldFactory.newFulltextField(value, true));
+ }
+ }
+ fields.add(FieldFactory.newPathField(path));
+ return fields;
+ }
+
+ @Test
+ public void simpleTest() throws Exception {
+ setupCSV("/sentence", "/para");
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Incorrect binaries processed", 2, stats.processed);
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Repeated call for already processed stuff shouldn't process anything more",
+ 2, stats.ignored);
+
+ assertConsistentStatsAndWriter();
+ assertStatsInvariants();
+ }
+
+ @Test
+ public void untrimmedText() throws Exception {
+ setupCSV("/untrimmed");
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Store generation didn't trim data", "untrimmed",
+ textWriter.data.get(FakeBinaryResourceProvider.getBlobId("/untrimmed")));
+
+ assertConsistentStatsAndWriter();
+ assertStatsInvariants();
+ }
+
+ @Test
+ public void indexedError() throws Exception {
+ setupCSV("/error");
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Indexed data reporting errored extraction not marked as error",
+ 1, stats.errored);
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Repeated run for indexed error shouldn't get processed again", 1, stats.ignored);
+
+ assertConsistentStatsAndWriter();
+ assertStatsInvariants();
+ }
+
+ @Test
+ public void indexedEmpty() throws Exception {
+ setupCSV("/empty");
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Indexed data for empty extraction not marked as empty",
+ 1, stats.empty);
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Repeated run for empty extraction shouldn't get processed again", 1, stats.ignored);
+
+ assertConsistentStatsAndWriter();
+ assertStatsInvariants();
+ }
+
+ @Test
+ public void indexedUntrimmedEmpty() throws Exception {
+ setupCSV("/untrimmed-empty");
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Indexed data for untrimmed empty extraction not marked as empty",
+ 1, stats.empty);
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Repeated run for untrimmed empty extraction shouldn't get processed again",
+ 1, stats.ignored);
+
+ assertConsistentStatsAndWriter();
+ assertStatsInvariants();
+ }
+
+ @Test
+ public void multiFTField() throws Exception {
+ setupCSV("/multi");
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Multi FT field in a doc not marked as error",
+ 1, stats.errored);
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Repeated run for multi FT error should get processed again", 0, stats.ignored);
+
+ assertStatsInvariants();
+ }
+
+ @Test
+ public void indexHasDocumentButNotData() throws Exception {
+ setupCSV("/null");
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("No FT field in a doc not marked as error",
+ 1, stats.errored);
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Repeated run for no FT error should get processed again", 0, stats.ignored);
+
+ assertStatsInvariants();
+ }
+
+ @Test
+ public void indexDoesNotHaveDocument() throws Exception {
+ setupCSV("/somethingRandom");
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("No indexed doc not marked as error",
+ 1, stats.errored);
+
+ textPopulator.populate(csv, indexDir);
+ assertEquals("Repeated run for no indexed doc error should get processed again", 0, stats.ignored);
+
+ assertStatsInvariants();
+ }
+
+ private void assertConsistentStatsAndWriter() {
+ assertEquals("Num blobs processed by text writer didn't process same not same as reported in stats",
+ textWriter.processed.size(), stats.processed);
+
+ }
+
+ private void assertStatsInvariants() {
+ assertTrue("Read (" + stats.read + ") !=" +
+ " Processed (" + stats.processed + ") + Ignored (" + stats.ignored + ")",
+ stats.read == stats.processed + stats.ignored);
+
+ assertTrue("Processed (" + stats.processed + ") !=" +
+ " Empty (" + stats.empty + ") + Errored (" + stats.errored + ") + Parsed (" + stats.parsed + ")",
+ stats.processed == stats.empty + stats.errored + stats.parsed);
+ }
+
+ private static class FakeTextWriter implements TextWriter {
+ final Set<String> processed = Sets.newHashSet();
+ final Map<String, String> data = Maps.newHashMap();
+
+ @Override
+ public void write(@Nonnull String blobId, @Nonnull String text) {
+ processed.add(blobId);
+ data.put(blobId, text);
+ }
+
+ @Override
+ public void markEmpty(String blobId) {
+ processed.add(blobId);
+ }
+
+ @Override
+ public void markError(String blobId) {
+ processed.add(blobId);
+ }
+
+ @Override
+ public boolean isProcessed(String blobId) {
+ return processed.contains(blobId);
+ }
+ }
+
+ private static class FakeBinaryResourceProvider implements BinaryResourceProvider {
+ private List<BinaryResource> binaries = Lists.newArrayList();
+
+ FakeBinaryResourceProvider(String ... paths) {
+ for (String path : paths) {
+ binaries.add(new BinaryResource(new StringByteSource(""), null, null, path, getBlobId(path)));
+ }
+ }
+
+ static String getBlobId(String path) {
+ return path + ":" + path;
+ }
+
+ @Override
+ public FluentIterable<BinaryResource> getBinaries(String path) {
+ return new FluentIterable<BinaryResource>() {
+ @Nonnull
+ @Override
+ public Iterator<BinaryResource> iterator() {
+ return binaries.iterator();
+ }
+ };
+ }
+ }
+
+ private static class StringByteSource extends ByteSource {
+ private final String data;
+
+ StringByteSource(String data) {
+ this.data = data;
+ }
+
+ @Override
+ public InputStream openStream() {
+ return new ByteArrayInputStream(data.getBytes(UTF_8));
+ }
+ }
+}
Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java
------------------------------------------------------------------------------
svn:eol-style = native