Posted to oak-commits@jackrabbit.apache.org by ca...@apache.org on 2018/05/25 10:41:28 UTC

svn commit: r1832231 - in /jackrabbit/oak/trunk/oak-run/src: main/java/org/apache/jackrabbit/oak/plugins/tika/ test/java/org/apache/jackrabbit/oak/plugins/tika/

Author: catholicon
Date: Fri May 25 10:41:28 2018
New Revision: 1832231

URL: http://svn.apache.org/viewvc?rev=1832231&view=rev
Log:
OAK-7353: oak-run tika extraction should support getting assistance from indexed data stored in a Lucene index
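
In effect, the new populate action pre-seeds the extracted-text store with whatever :fulltext values the supplied Lucene index already holds for the paths listed in the CSV data file, so that a subsequent extract run only needs to parse binaries the index did not cover. As a rough orientation, the wiring this commit adds to TextExtractorMain boils down to the sketch below (the file locations are placeholders, and the import for DataStoreTextWriter is assumed to match the one TextExtractorMain already uses; it is not part of this diff):

    package org.apache.jackrabbit.oak.plugins.tika;

    import com.google.common.io.Closer;

    import java.io.File;
    import java.io.IOException;

    // Minimal sketch, condensed from the --populate branch added to TextExtractorMain below.
    class PopulateSketch {
        static void run(File dataFile, File indexDir, File storeDir) throws IOException {
            try (Closer closer = Closer.create()) {
                // The (storeDir, false) arguments mirror the call added to TextExtractorMain.
                DataStoreTextWriter writer = closer.register(new DataStoreTextWriter(storeDir, false));
                new TextPopulator(writer).populate(dataFile, indexDir);
            }
        }
    }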

Added:
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java   (with props)
    jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java   (with props)
Modified:
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
    jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java?rev=1832231&r1=1832230&r2=1832231&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextExtractorMain.java Fri May 25 10:41:28 2018
@@ -68,10 +68,12 @@ public class TextExtractorMain {
         try (Closer closer = Closer.create()) {
             boolean report = tikaOpts.report();
             boolean extract = tikaOpts.extract();
+            boolean populate = tikaOpts.populate();
             boolean generate = tikaOpts.generate();
-            BlobStore blobStore;
+            BlobStore blobStore = null;
             NodeStore nodeStore = null;
             File dataFile = tikaOpts.getDataFile();
+            File indexDir = tikaOpts.getIndexDir();
             File storeDir = tikaOpts.getStoreDir();
             File tikaConfigFile = tikaOpts.getTikaConfig();
             BinaryResourceProvider binaryResourceProvider = null;
@@ -92,19 +94,24 @@ public class TextExtractorMain {
 
             checkNotNull(dataFile, "Data file not configured with %s", tikaOpts.getDataFileSpecOpt());
 
-            if (!generate) {
+            if (report || extract) {
                 //For report and extract case we do not need NodeStore access so create BlobStore directly
                 BlobStoreFixture blobStoreFixture = BlobStoreFixtureProvider.create(opts);
                 closer.register(blobStoreFixture);
                 blobStore = checkNotNull(blobStoreFixture).getBlobStore();
-            } else {
+            } else if (generate) {
                 NodeStoreFixture nodeStoreFixture = NodeStoreFixtureProvider.create(opts);
                 closer.register(nodeStoreFixture);
                 blobStore = nodeStoreFixture.getBlobStore();
                 nodeStore = nodeStoreFixture.getStore();
             }
 
-            checkNotNull(blobStore, "This command requires an external BlobStore configured");
+            if (!populate) {
+                checkNotNull(blobStore, "This command requires an external BlobStore configured");
+            }
+
+            // NOTE: generate, populate and extract are deliberately executed in this order so that
+            // the tool behaves correctly when the user requests multiple actions in the same run.
 
             if (generate){
                 checkNotNull(dataFile, "Data file path not provided");
@@ -114,6 +121,20 @@ public class TextExtractorMain {
                 generator.generate(brp.getBinaries(path));
             }
 
+            if (populate) {
+                checkArgument(dataFile.exists(),
+                        "Data file %s does not exist", dataFile.getAbsolutePath());
+                checkNotNull(indexDir, "Lucene index directory " +
+                        "must be specified via %s", tikaOpts.getIndexDirSpecOpt());
+                checkNotNull(storeDir, "Directory to store extracted text content " +
+                        "must be specified via %s", tikaOpts.getStoreDirSpecOpt());
+
+                DataStoreTextWriter writer = closer.register(new DataStoreTextWriter(storeDir, false));
+
+                TextPopulator textPopulator = new TextPopulator(writer);
+                textPopulator.populate(dataFile, indexDir);
+            }
+
             if (report || extract) {
                 checkArgument(dataFile.exists(),
                         "Data file %s does not exist", dataFile.getAbsolutePath());

Added: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java?rev=1832231&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java Fri May 25 10:41:28 2018
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import com.google.common.base.Stopwatch;
+import com.google.common.io.Closer;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.FSDirectory;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.File;
+import java.io.IOException;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static org.apache.jackrabbit.JcrConstants.JCR_PATH;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.FULLTEXT;
+import static org.apache.jackrabbit.oak.plugins.index.lucene.FieldNames.PATH;
+import static org.apache.jackrabbit.oak.plugins.tika.CSVFileBinaryResourceProvider.FORMAT;
+
+class TextPopulator {
+    private static final Logger log = LoggerFactory.getLogger(TextPopulator.class);
+
+    static final String BLOB_ID = "blobId";
+    static final String ERROR_TEXT = "TextExtractionError";
+
+    private final TextWriter textWriter;
+
+    private PopulatorStats stats;
+
+    TextPopulator(TextWriter textWriter) {
+        this.textWriter = textWriter;
+        this.stats = new PopulatorStats();
+    }
+
+    // exposed for test purposes only
+    void setStats(PopulatorStats stats) {
+        this.stats = stats;
+    }
+
+    void populate(File dataFile, File indexDir) throws IOException {
+        try (Closer closer = Closer.create()) {
+            Iterable<CSVRecord> csvRecords = closer.register(CSVParser.parse(dataFile, UTF_8, FORMAT));
+
+            final FSDirectory dir = closer.register(FSDirectory.open(indexDir));
+            final DirectoryReader reader = closer.register(DirectoryReader.open(dir));
+            final IndexSearcher searcher = new IndexSearcher(reader);
+
+            for (CSVRecord record : csvRecords) {
+                String blobId = record.get(BLOB_ID);
+                String jcrPath = record.get(JCR_PATH);
+
+                if (!textWriter.isProcessed(blobId)) {
+                    String text = getText(reader, searcher, jcrPath);
+
+                    stats.processed++;
+
+                    if (text == null) {
+                        // Ignore errors as we might be processing partial OR incorrect index
+                        // writer.markError(blobId);
+                        stats.errored++;
+                    } else if (ERROR_TEXT.equals(text)) {
+                        textWriter.markError(blobId);
+                        stats.errored++;
+                    } else if (text.length() == 0) {
+                        textWriter.markEmpty(blobId);
+                        stats.empty++;
+                    } else {
+                        textWriter.write(blobId, text);
+                        stats.parsed++;
+                    }
+                } else {
+                    stats.ignored++;
+                }
+
+                stats.readAndDumpStatsIfRequired(jcrPath);
+            }
+            log.info(stats.toString());
+        }
+    }
+
+    private static String getText(DirectoryReader reader, IndexSearcher searcher, String path) {
+        TopDocs topDocs;
+        try {
+            topDocs = searcher.search(new TermQuery(new Term(PATH, path)), 1);
+
+            ScoreDoc[] scoreDocs = topDocs.scoreDocs;
+            if (scoreDocs.length != 1) {
+                return null;
+            }
+
+            Document doc = reader.document(scoreDocs[0].doc);
+
+            String[] ftVals = doc.getValues(FULLTEXT);
+            if (ftVals.length != 1) {
+                // being conservative... expecting only one stored fulltext field
+                return null;
+            }
+
+            return ftVals[0].trim();
+        } catch (IOException e) {
+            // ignore
+        }
+
+        return null;
+    }
+
+    static class PopulatorStats {
+        int read = 0;
+        int ignored = 0;
+        int processed = 0;
+        int parsed = 0;
+        int errored = 0;
+        int empty = 0;
+
+        Stopwatch w = Stopwatch.createStarted();
+
+        void readAndDumpStatsIfRequired(String path) {
+            read++;
+
+            if (read%10000 == 0) {
+                log.info("{} - currently at {}", this.toString(), path);
+            }
+        }
+
+        @Override
+        public String toString() {
+            return String.format("Text populator stats - " +
+                            "Read: %s; Ignored: %s; Processed: %s; Parsed: %s; Errored: %s; Empty: %s (in %s)",
+                    read, ignored, processed, parsed, errored, empty, w);
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulator.java
------------------------------------------------------------------------------
    svn:eol-style = native
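
Because TextPopulator only talks to the TextWriter interface (write, markEmpty, markError, isProcessed; see the FakeTextWriter in the test added below), one can sketch a hypothetical dry-run writer that merely records what a populate run would store, without touching a real extracted-text store. The class name and file locations below are illustrative only; it sits in the same package because TextPopulator is package-private:

    package org.apache.jackrabbit.oak.plugins.tika;

    import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;

    import java.io.File;
    import java.io.IOException;
    import java.util.HashMap;
    import java.util.HashSet;
    import java.util.Map;
    import java.util.Set;

    // Hypothetical dry-run writer: counts what populate would write or mark instead of persisting it.
    class DryRunTextWriter implements TextWriter {
        final Map<String, String> written = new HashMap<>();
        final Set<String> empty = new HashSet<>();
        final Set<String> errored = new HashSet<>();

        @Override
        public void write(String blobId, String text) {
            written.put(blobId, text);
        }

        @Override
        public void markEmpty(String blobId) {
            empty.add(blobId);
        }

        @Override
        public void markError(String blobId) {
            errored.add(blobId);
        }

        @Override
        public boolean isProcessed(String blobId) {
            return written.containsKey(blobId) || empty.contains(blobId) || errored.contains(blobId);
        }

        // Example driver; the file locations are placeholders.
        public static void main(String[] args) throws IOException {
            DryRunTextWriter writer = new DryRunTextWriter();
            new TextPopulator(writer).populate(new File("binary-stats.csv"), new File("index-dump"));
            System.out.printf("would write %d, mark %d empty, mark %d errored%n",
                    writer.written.size(), writer.empty.size(), writer.errored.size());
        }
    }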

Modified: jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java?rev=1832231&r1=1832230&r2=1832231&view=diff
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java (original)
+++ jackrabbit/oak/trunk/oak-run/src/main/java/org/apache/jackrabbit/oak/plugins/tika/TikaCommandOptions.java Fri May 25 10:41:28 2018
@@ -38,10 +38,12 @@ public class TikaCommandOptions implemen
     private final OptionSpec<File> dataFileSpecOpt;
     private final OptionSpec<File> tikaConfigSpecOpt;
     private final OptionSpec<File> storeDirSpecOpt;
+    private final OptionSpec<File> indexDirSpecOpt;
     private final OptionSpec<Integer> poolSizeOpt;
 
     private final OptionSpec<Void> reportAction;
     private final OptionSpec<Void> generateAction;
+    private final OptionSpec<Void> populateAction;
     private final OptionSpec<Void> extractAction;
 
     private final Set<String> operationNames;
@@ -71,6 +73,11 @@ public class TikaCommandOptions implemen
                 .withRequiredArg()
                 .ofType(File.class);
 
+        indexDirSpecOpt = parser
+                .accepts("index-dir", "Path of directory which stores lucene index containing extracted data")
+                .withRequiredArg()
+                .ofType(File.class);
+
         poolSizeOpt = parser
                 .accepts("pool-size", "Size of the thread pool used to perform text extraction. Defaults " +
                         "to number of cores on the system")
@@ -79,9 +86,10 @@ public class TikaCommandOptions implemen
 
         reportAction = parser.accepts("report", "Generates a summary report based on the csv file");
         generateAction = parser.accepts("generate", "Generates the CSV file required for 'extract' and 'report' actions");
+        populateAction = parser.accepts("populate", "Populates the extraction store based on the supplied indexed data and csv file");
         extractAction = parser.accepts("extract", "Performs the text extraction based on the csv file");
 
-        operationNames = ImmutableSet.of("report", "generate", "extract");
+        operationNames = ImmutableSet.of("report", "generate", "populate", "extract");
     }
 
     @Override
@@ -97,7 +105,7 @@ public class TikaCommandOptions implemen
     @Override
     public String description() {
         return "The tika command supports following operations. All operations connect to repository in read only mode. \n" +
-                "Use of one of the supported actions like --report, --generate, --extract etc. ";
+                "Use of one of the supported actions like --report, --generate, --populate, --extract etc. ";
     }
 
     @Override
@@ -126,6 +134,10 @@ public class TikaCommandOptions implemen
         return storeDirSpecOpt.value(options);
     }
 
+    public File getIndexDir() {
+        return indexDirSpecOpt.value(options);
+    }
+
     public boolean isPoolSizeDefined() {
         return options.has(poolSizeOpt);
     }
@@ -143,6 +155,10 @@ public class TikaCommandOptions implemen
         return options.has(generateAction) || hasNonOption("generate");
     }
 
+    public boolean populate() {
+        return options.has(populateAction) || hasNonOption("populate");
+    }
+
     public boolean extract() {
         return options.has(extractAction) || hasNonOption("extract");
     }
@@ -151,6 +167,10 @@ public class TikaCommandOptions implemen
         return dataFileSpecOpt;
     }
 
+    public OptionSpec<File> getIndexDirSpecOpt() {
+        return indexDirSpecOpt;
+    }
+
     public OptionSpec<File> getStoreDirSpecOpt() {
         return storeDirSpecOpt;
     }

Added: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java
URL: http://svn.apache.org/viewvc/jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java?rev=1832231&view=auto
==============================================================================
--- jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java (added)
+++ jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java Fri May 25 10:41:28 2018
@@ -0,0 +1,309 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.jackrabbit.oak.plugins.tika;
+
+import com.beust.jcommander.internal.Maps;
+import com.google.common.collect.FluentIterable;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
+import com.google.common.io.ByteSource;
+import org.apache.jackrabbit.oak.plugins.blob.datastore.TextWriter;
+import org.apache.jackrabbit.oak.plugins.index.lucene.FieldFactory;
+import org.apache.jackrabbit.oak.plugins.index.lucene.OakAnalyzer;
+import org.apache.jackrabbit.oak.plugins.tika.TextPopulator.PopulatorStats;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Version;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+import javax.annotation.Nonnull;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import static com.google.common.base.Charsets.UTF_8;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+public class TextPopulatorTest {
+    @Rule
+    public TemporaryFolder temporaryFolder = new TemporaryFolder();
+
+    private File indexDir = null;
+    private File csv = null;
+    private FakeTextWriter textWriter = new FakeTextWriter();
+    private PopulatorStats stats = new PopulatorStats();
+    private TextPopulator textPopulator = new TextPopulator(textWriter);
+
+    @Before
+    public void setup() throws Exception {
+        indexDir = temporaryFolder.newFolder("index-dump");
+        csv = temporaryFolder.newFile("blobs.csv");
+
+        textPopulator.setStats(stats);
+
+        setupIndexData();
+    }
+
+    private void setupIndexData() throws Exception {
+        Map<String, String> dataMap = Maps.newHashMap();
+        dataMap.put("/sentence", "some sentence.");
+        dataMap.put("/para", "some sentence.\nAnd more sentence after a new line");
+        dataMap.put("/error", TextPopulator.ERROR_TEXT);
+        dataMap.put("/null", null);
+        dataMap.put("/empty", "");
+        dataMap.put("/untrimmed-empty", " ");
+        dataMap.put("/untrimmed", " untrimmed ");
+
+        FSDirectory directory = FSDirectory.open(indexDir);
+        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new OakAnalyzer(Version.LUCENE_47));
+        try (IndexWriter writer = new IndexWriter(directory, config)) {
+            for (Map.Entry<String, String> data : dataMap.entrySet()) {
+                writer.addDocument(createLuceneDocument(data.getKey(), data.getValue()));
+            }
+
+            // add document with multiple :fulltext
+            writer.addDocument(createLuceneDocument("/multi", "value1", "value2"));
+        }
+    }
+
+    private void setupCSV(String ... paths) throws IOException {
+        BinaryResourceProvider brp = new FakeBinaryResourceProvider(paths);
+        CSVFileGenerator generator = new CSVFileGenerator(csv);
+        generator.generate(brp.getBinaries("/"));
+    }
+
+    private List<Field> createLuceneDocument(@Nonnull String path, String ... values) {
+        List<Field> fields = Lists.newArrayList();
+        for (String value : values) {
+            if (value != null) {
+                fields.add(FieldFactory.newFulltextField(value, true));
+            }
+        }
+        fields.add(FieldFactory.newPathField(path));
+        return fields;
+    }
+
+    @Test
+    public void simpleTest() throws Exception {
+        setupCSV("/sentence", "/para");
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Incorrect binaries processed", 2, stats.processed);
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Repeated call for already processed stuff shouldn't process anything more",
+                2, stats.ignored);
+
+        assertConsistentStatsAndWriter();
+        assertStatsInvariants();
+    }
+
+    @Test
+    public void untrimmedText() throws Exception {
+        setupCSV("/untrimmed");
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Store generation didn't trim data", "untrimmed",
+                textWriter.data.get(FakeBinaryResourceProvider.getBlobId("/untrimmed")));
+
+        assertConsistentStatsAndWriter();
+        assertStatsInvariants();
+    }
+
+    @Test
+    public void indexedError() throws Exception {
+        setupCSV("/error");
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Indexed data reporting errored extraction not marked as error",
+                1, stats.errored);
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Repeated run for indexed error shouldn't get processed again", 1, stats.ignored);
+
+        assertConsistentStatsAndWriter();
+        assertStatsInvariants();
+    }
+
+    @Test
+    public void indexedEmpty() throws Exception {
+        setupCSV("/empty");
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Indexed data for empty extraction not marked as empty",
+                1, stats.empty);
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Repeated run for empty extraction shouldn't get processed again", 1, stats.ignored);
+
+        assertConsistentStatsAndWriter();
+        assertStatsInvariants();
+    }
+
+    @Test
+    public void indexedUntrimmedEmpty() throws Exception {
+        setupCSV("/untrimmed-empty");
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Indexed data for untrimmed empty extraction not marked as empty",
+                1, stats.empty);
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Repeated run for untrimmed empty extraction shouldn't get processed again",
+                1, stats.ignored);
+
+        assertConsistentStatsAndWriter();
+        assertStatsInvariants();
+    }
+
+    @Test
+    public void multiFTField() throws Exception {
+        setupCSV("/multi");
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Multi FT field in a doc not marked as error",
+                1, stats.errored);
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Repeated run for multi FT error should get processed again", 0, stats.ignored);
+
+        assertStatsInvariants();
+    }
+
+    @Test
+    public void indexHasDocumentButNotData() throws Exception {
+        setupCSV("/null");
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("No FT field in a doc not marked as error",
+                1, stats.errored);
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Repeated run for no FT error should get processed again", 0, stats.ignored);
+
+        assertStatsInvariants();
+    }
+
+    @Test
+    public void indexDoesNotHaveDocument() throws Exception {
+        setupCSV("/somethingRandom");
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("No indexed doc not marked as error",
+                1, stats.errored);
+
+        textPopulator.populate(csv, indexDir);
+        assertEquals("Repeated run for no indexed doc error should get processed again", 0, stats.ignored);
+
+        assertStatsInvariants();
+    }
+
+    private void assertConsistentStatsAndWriter() {
+        assertEquals("Num blobs processed by text writer didn't process same not same as reported in stats",
+                textWriter.processed.size(), stats.processed);
+
+    }
+
+    private void assertStatsInvariants() {
+        assertTrue("Read (" + stats.read + ") !=" +
+                        " Processed (" + stats.processed + ") + Ignored (" + stats.ignored + ")",
+                stats.read == stats.processed + stats.ignored);
+
+        assertTrue("Processed (" + stats.processed + ") !=" +
+                        " Empty (" + stats.empty + ") + Errored (" + stats.errored + ") + Parsed (" + stats.parsed + ")",
+                stats.processed == stats.empty + stats.errored + stats.parsed);
+    }
+
+    private static class FakeTextWriter implements TextWriter {
+        final Set<String> processed = Sets.newHashSet();
+        final Map<String, String> data = Maps.newHashMap();
+
+        @Override
+        public void write(@Nonnull String blobId, @Nonnull String text) {
+            processed.add(blobId);
+            data.put(blobId, text);
+        }
+
+        @Override
+        public void markEmpty(String blobId) {
+            processed.add(blobId);
+        }
+
+        @Override
+        public void markError(String blobId) {
+            processed.add(blobId);
+        }
+
+        @Override
+        public boolean isProcessed(String blobId) {
+            return processed.contains(blobId);
+        }
+    }
+
+    private static class FakeBinaryResourceProvider implements BinaryResourceProvider {
+        private List<BinaryResource> binaries = Lists.newArrayList();
+
+        FakeBinaryResourceProvider(String ... paths) {
+            for (String path : paths) {
+                binaries.add(new BinaryResource(new StringByteSource(""), null, null, path, getBlobId(path)));
+            }
+        }
+
+        static String getBlobId(String path) {
+            return path + ":" + path;
+        }
+
+        @Override
+        public FluentIterable<BinaryResource> getBinaries(String path) {
+            return new FluentIterable<BinaryResource>() {
+                @Nonnull
+                @Override
+                public Iterator<BinaryResource> iterator() {
+                    return binaries.iterator();
+                }
+            };
+        }
+    }
+
+    private static class StringByteSource extends ByteSource {
+        private final String data;
+
+        StringByteSource(String data) {
+            this.data = data;
+        }
+
+        @Override
+        public InputStream openStream() {
+            return new ByteArrayInputStream(data.getBytes(UTF_8));
+        }
+    }
+}

Propchange: jackrabbit/oak/trunk/oak-run/src/test/java/org/apache/jackrabbit/oak/plugins/tika/TextPopulatorTest.java
------------------------------------------------------------------------------
    svn:eol-style = native