You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@jena.apache.org by an...@apache.org on 2013/06/21 14:50:31 UTC

svn commit: r1495423 - /jena/trunk/jena-text/src/main/java/jena/textindexer.java

Author: andy
Date: Fri Jun 21 12:50:30 2013
New Revision: 1495423

URL: http://svn.apache.org/r1495423
Log:
Rework command line handling and dataset construction.

Modified:
    jena/trunk/jena-text/src/main/java/jena/textindexer.java

Modified: jena/trunk/jena-text/src/main/java/jena/textindexer.java
URL: http://svn.apache.org/viewvc/jena/trunk/jena-text/src/main/java/jena/textindexer.java?rev=1495423&r1=1495422&r2=1495423&view=diff
==============================================================================
--- jena/trunk/jena-text/src/main/java/jena/textindexer.java (original)
+++ jena/trunk/jena-text/src/main/java/jena/textindexer.java Fri Jun 21 12:50:30 2013
@@ -16,200 +16,208 @@
  * limitations under the License.
  */
 
-package jena;
-
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.Set;
-
-import org.apache.jena.query.text.Entity;
-import org.apache.jena.query.text.EntityDefinition;
-import org.apache.jena.query.text.TextIndex;
-import org.apache.jena.query.text.TextQuery;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import arq.cmd.CmdException;
-import arq.cmdline.CmdARQ;
-import arq.cmdline.ModDataset;
-import arq.cmdline.ModDatasetAssembler;
-
-import com.hp.hpl.jena.graph.Node;
-import com.hp.hpl.jena.query.Dataset;
-import com.hp.hpl.jena.sparql.core.DatasetGraph;
-import com.hp.hpl.jena.sparql.core.Quad;
-import com.hp.hpl.jena.sparql.util.FmtUtils;
+package jena ;
 
+import java.util.HashSet ;
+import java.util.Iterator ;
+import java.util.Set ;
+
+import org.apache.jena.query.text.* ;
+import org.slf4j.Logger ;
+import org.slf4j.LoggerFactory ;
+import arq.cmd.CmdException ;
+import arq.cmdline.ArgDecl ;
+import arq.cmdline.CmdARQ ;
+
+import com.hp.hpl.jena.graph.Node ;
+import com.hp.hpl.jena.query.Dataset ;
+import com.hp.hpl.jena.sparql.core.Quad ;
+import com.hp.hpl.jena.sparql.util.FmtUtils ;
 
 /**
- * Text indexer application that will read a dataset and index its triples in its text index.
+ * Text indexer application that will read a dataset and index its triples in
+ * its text index.
  */
 public class textindexer extends CmdARQ {
-   
-    private static Logger log = LoggerFactory.getLogger(textindexer.class) ;
 
-    protected ModDataset       modDataset =  new ModDatasetAssembler() ;
-    protected Dataset          dataset = null;
-    protected TextIndex        textIndex = null;
-    protected EntityDefinition entityDefinition;
-    protected ProgressMonitor  progressMonitor;
+    private static Logger      log          = LoggerFactory.getLogger(textindexer.class) ;
+
+    public static final ArgDecl assemblerDescDecl = new ArgDecl(ArgDecl.HasValue, "desc", "dataset") ;
+    private String assemblerFile = null ;
+    
+    protected DatasetGraphText dataset      = null ;
+    protected TextIndex        textIndex    = null ;
+    protected EntityDefinition entityDefinition ;
+    protected ProgressMonitor  progressMonitor ;
 
-    static public void main(String... argv)
-    { 
+    static public void main(String... argv) {
+        TextQuery.init() ;
         new textindexer(argv).mainRun() ;
     }
-    
+
     static public void testMain(String... argv) {
-    	new textindexer(argv).mainMethod();
+        new textindexer(argv).mainMethod() ;
     }
-    
-    // @@ TODO
-    // check integrated properly with command line processing utilities
-    protected textindexer(String[] argv)
-    { 
+
+    protected textindexer(String[] argv) {
         super(argv) ;
-        super.addModule(modDataset);
-        progressMonitor = new ProgressMonitor("properties indexed");
-    }  
+        super.add(assemblerDescDecl, "--desc=", "Assembler description file") ;
+        progressMonitor = new ProgressMonitor("properties indexed") ;
+    }
 
     @Override
-    protected void processModulesAndArgs()
-    {
+    protected void processModulesAndArgs() {
         super.processModulesAndArgs() ;
-        dataset = modDataset.createDataset();
-        if (dataset == null)
-        	throw new CmdException("No dataset specified") ; 
-        textIndex = (TextIndex) dataset.getContext().get(TextQuery.textIndex);
-        if (textIndex == null) {
-        	throw new CmdException("Dataset has no text index");
-        } 
-        entityDefinition = textIndex.getDocDef();
+        // Two forms : with and without arg.
+        // Maximises similarity with other tools.
+        String file ;
+        if ( super.contains(assemblerDescDecl) ) {
+            if ( getValues(assemblerDescDecl).size() != 1 )
+                throw new CmdException("Multiple assembler descriptions given") ;
+            if ( getPositional().size() != 0 )
+                throw new CmdException("Additional assembler descriptions given") ; 
+            file = getValue(assemblerDescDecl) ;
+        } else {
+            if ( getNumPositional() != 1 )
+                throw new CmdException("Multiple assembler descriptions given") ;
+            file = getPositionalArg(0) ;
+        }
+        
+        if (file == null)
+            throw new CmdException("No dataset specified") ;
+        // Assumes a single text dataset description in the assembler file.
+        Dataset ds = TextDatasetFactory.create(file) ;
+        if (ds == null)
+            throw new CmdException("No dataset description found") ;
+        // get index.
+        dataset = (DatasetGraphText)(ds.asDatasetGraph()) ;
+        textIndex = dataset.getTextIndex() ;
+        if (textIndex == null)
+            throw new CmdException("Dataset has no text index") ;
+        entityDefinition = textIndex.getDocDef() ;
     }
-    
-	@Override
-	protected String getSummary() {
-		return getCommandName()+" [--desc | --dataset] assemblerPath" ;
-	}	
-	
-	@Override
-	protected void exec() {	
-		Set<Node> properties = getIndexedProperties();
-		DatasetGraph dsg = dataset.asDatasetGraph();
-		textIndex.startIndexing();
-		
-		// there are various strategies possible here
-		// what is implemented is a first cut simple approach
-		// currently - for each indexed property
-		//                list and index triples with that property
-		// that way only process triples that will be indexed
-		// but each entity may be updated several times
-		
-		for (Iterator<Node> propIter = properties.iterator(); propIter.hasNext() ; ) {
-		    Iterator<Quad> quadIter = dsg.find(Node.ANY, Node.ANY, propIter.next(), Node.ANY) ;
-		    for ( ; quadIter.hasNext(); ) {
-		    	Quad quad = quadIter.next();
-		    	Entity entity = createEntity(quad) ;
-		    	if (entity != null) {
-		    	    textIndex.addEntity(entity);
-		    	    progressMonitor.progressByOne();
-		    	}
-		    }
-		}
-		textIndex.finishIndexing();	
-	    progressMonitor.close();
-	}
-	
-	private Set<Node> getIndexedProperties() {
-		Set<Node> result = new HashSet<Node>();
-		for ( Iterator<String> iter = entityDefinition.fields().iterator(); iter.hasNext(); ) {
-			result.add(entityDefinition.getPredicate(iter.next()));			
-		}
-		return result;
-	}
-	
-	private Entity createEntity(Quad quad) {
-		Node s = quad.getSubject();
-		String x = (s.isURI() ) ? s.getURI() : s.getBlankNodeLabel() ;
-		Entity result = new Entity(x);
+
+    @Override
+    protected String getSummary() {
+        return getCommandName() + " assemblerFile" ;
+    }
+
+    @Override
+    protected void exec() {
+        Set<Node> properties = getIndexedProperties() ;
+        textIndex.startIndexing() ;
+
+        // There are various strategies possible here.
+        // What is implemented is a simple first-cut approach:
+        // for each indexed property,
+        // list and index the triples with that property.
+        // That way only triples that will be indexed are processed,
+        // but each entity may be updated several times.
+
+        for (Iterator<Node> propIter = properties.iterator(); propIter.hasNext();) {
+            Node p = propIter.next() ;
+            Iterator<Quad> quadIter = dataset.find(Node.ANY, Node.ANY, p, Node.ANY) ;
+            for (; quadIter.hasNext();) {
+                Quad quad = quadIter.next() ;
+                Entity entity = createEntity(quad) ;
+                if (entity != null) {
+                    textIndex.addEntity(entity) ;
+                    progressMonitor.progressByOne() ;
+                }
+            }
+        }
+        textIndex.finishIndexing() ;
+        progressMonitor.close() ;
+    }
+
+    private Set<Node> getIndexedProperties() {
+        Set<Node> result = new HashSet<Node>() ;
+        for (Iterator<String> iter = entityDefinition.fields().iterator(); iter.hasNext();) {
+            result.add(entityDefinition.getPredicate(iter.next())) ;
+        }
+        return result ;
+    }
+
+    private Entity createEntity(Quad quad) {
+        Node s = quad.getSubject() ;
+        String x = (s.isURI()) ? s.getURI() : s.getBlankNodeLabel() ;
+        Entity result = new Entity(x) ;
         Node p = quad.getPredicate() ;
         String field = entityDefinition.getField(p) ;
-        if ( field == null )
+        if (field == null)
             return null ;
         Node o = quad.getObject() ;
         String val = null ;
-        if ( o.isURI() )
+        if (o.isURI())
             val = o.getURI() ;
-        else if ( o.isLiteral() )
-            val = o.getLiteralLexicalForm() ;
         else
-        {
-           log.warn("Not a literal value for mapped field-predicate: "+field+" :: "+FmtUtils.stringForString(field)) ;
-           return null;
-        }
+            if (o.isLiteral())
+                val = o.getLiteralLexicalForm() ;
+            else {
+                log.warn("Not a literal value for mapped field-predicate: " + field + " :: "
+                         + FmtUtils.stringForString(field)) ;
+                return null ;
+            }
         result.put(field, val) ;
-        return result;
+        return result ;
+    }
+
+    // TDBLoader has a similar progress monitor
+    // Not used here to avoid making ARQ dependent on TDB
+    // So potential to rationalise and put progress monitor in a common
+    // utility class
+    private static class ProgressMonitor {
+        String progressMessage ;
+        long   startTime ;
+        long   progressCount ;
+        long   intervalStartTime ;
+        long   progressAtStartOfInterval ;
+        long   reportingInterval = 10000 ; // milliseconds
+
+        ProgressMonitor(String progressMessage) {
+            this.progressMessage = progressMessage ;
+            start() ; // in case start not called
+        }
+
+        void start() {
+            startTime = System.currentTimeMillis() ;
+            progressCount = 0L ;
+            startInterval() ;
+        }
+
+        private void startInterval() {
+            intervalStartTime = System.currentTimeMillis() ;
+            progressAtStartOfInterval = progressCount ;
+        }
+
+        void progressByOne() {
+            progressCount++ ;
+            long now = System.currentTimeMillis() ;
+            if (reportDue(now)) {
+                report(now) ;
+                startInterval() ;
+            }
+        }
+
+        boolean reportDue(long now) {
+            return now - intervalStartTime >= reportingInterval ;
+        }
+
+        private void report(long now) {
+            long progressThisInterval = progressCount - progressAtStartOfInterval ;
+            long intervalDuration = now - intervalStartTime ;
+            long overallDuration = now - startTime ;
+            String message = progressCount + " (" + progressThisInterval / (intervalDuration / 1000) + " per second)"
+                             + progressMessage + " (" + progressCount / Math.max(overallDuration / 1000, 1)
+                             + " per second overall)" ;
+            log.info(message) ;
+        }
+
+        void close() {
+            long overallDuration = System.currentTimeMillis() - startTime ;
+            String message = progressCount + " (" + progressCount / Math.max(overallDuration / 1000, 1)
+                             + " per second)" + progressMessage ;
+            log.info(message) ;
+        }
     }
-	
-	// TDBLoader has a similar progress monitor
-	// Not used here to avoid making ARQ dependent on TDB
-	// So potential to rationalise and put progress monitor in a common
-	// utility class @@ TODO
-	private static class ProgressMonitor {
-		String progressMessage;
-		long startTime;
-		long progressCount;
-		long intervalStartTime;
-		long progressAtStartOfInterval;
-		long reportingInterval = 10000; // milliseconds
-		
-		ProgressMonitor(String progressMessage) {
-			this.progressMessage = progressMessage ;
-			start();  // in case start not called
-		}
-		
-		void start() {
-			startTime = System.currentTimeMillis();
-			progressCount = 0L;
-			startInterval();
-		}
-		
-		private void startInterval() {
-			intervalStartTime = System.currentTimeMillis();
-			progressAtStartOfInterval = progressCount;
-		}
-		
-		void progressByOne() {
-			progressCount++;
-			long now = System.currentTimeMillis();
-			if (reportDue(now)) {
-				report(now);
-				startInterval();
-			}
-		}
-		
-		boolean reportDue(long now) {
-			return now - intervalStartTime >= reportingInterval;
-		}
-		
-		private void report(long now) {
-			long progressThisInterval = progressCount - progressAtStartOfInterval;
-			long intervalDuration = now - intervalStartTime;
-			long overallDuration = now - startTime;
-			String message = 
-				progressCount +
-				" (" + progressThisInterval / (intervalDuration/1000) + " per second)" +
-				progressMessage +
-			    " (" + progressCount / Math.max(overallDuration /1000, 1) + " per second overall)";
-			log.info(message);		
-		}
-		
-		void close() {
-			long overallDuration = System.currentTimeMillis() - startTime;
-			String message =
-				progressCount +
-				" (" + progressCount / Math.max(overallDuration / 1000, 1) + " per second)" +
-				progressMessage;
-			log.info(message);
-		}
-	}
 }