Posted to commits@stanbol.apache.org by sf...@apache.org on 2010/12/12 16:13:37 UTC
svn commit: r1044832 [3/14] - in /incubator/stanbol/trunk/rick:
indexing/dbPedia/src/main/java/eu/iksproject/rick/indexing/dbPedia/cli/
indexing/genericRdf/src/main/java/eu/iksproject/rick/indexing/rdf/
indexing/geonames/src/main/java/eu/iksproject/ric...
Modified: incubator/stanbol/trunk/rick/indexing/geonames/src/main/java/eu/iksproject/rick/indexing/geonames/GeoNamesIndexer.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/rick/indexing/geonames/src/main/java/eu/iksproject/rick/indexing/geonames/GeoNamesIndexer.java?rev=1044832&r1=1044831&r2=1044832&view=diff
==============================================================================
--- incubator/stanbol/trunk/rick/indexing/geonames/src/main/java/eu/iksproject/rick/indexing/geonames/GeoNamesIndexer.java (original)
+++ incubator/stanbol/trunk/rick/indexing/geonames/src/main/java/eu/iksproject/rick/indexing/geonames/GeoNamesIndexer.java Sun Dec 12 15:13:35 2010
@@ -46,1006 +46,1006 @@ import eu.iksproject.rick.servicesapi.ya
import eu.iksproject.rick.servicesapi.yard.YardException;
public class GeoNamesIndexer {
-
- public static final String[] fieldMappings;
- static {
- ArrayList<String> mappings = new ArrayList<String>();
- mappings.add(Properties.gn_name.toString());
- //While indexing I use the UTF-8 name as RDFS label (ASCII as fallback).
- //This should also be the case for updated documents
- mappings.add(Properties.gn_name.toString()+" > "+Properties.rdfs_label.toString());
- mappings.add(Properties.gn_alternateName.toString());
- mappings.add(Properties.gn_countryCode.toString());
- mappings.add(Properties.gn_featureClass.toString());
- mappings.add(Properties.gn_featureCode.toString());
- mappings.add(Properties.gn_officialName.toString());
- //This cache copies the values of the sub-properties of parentFeature
- //to the super property. So we need to write the according mappings
- mappings.add(Properties.gn_parentADM1.toString());
- mappings.add(Properties.gn_parentADM1.toString()+" > "+Properties.gn_parentFeature.toString());
- mappings.add(Properties.gn_parentADM2.toString());
- mappings.add(Properties.gn_parentADM2.toString()+" > "+Properties.gn_parentFeature.toString());
- mappings.add(Properties.gn_parentADM3.toString());
- mappings.add(Properties.gn_parentADM3.toString()+" > "+Properties.gn_parentFeature.toString());
- mappings.add(Properties.gn_parentADM4.toString());
- mappings.add(Properties.gn_parentADM4.toString()+" > "+Properties.gn_parentFeature.toString());
- mappings.add(Properties.gn_parentCountry.toString());
- mappings.add(Properties.gn_parentCountry.toString()+" > "+Properties.gn_parentFeature.toString());
- mappings.add(Properties.gn_parentFeature.toString());
- //population is converted to long (NOTE: population of Asia > Integer.MAX_VALUE)
- mappings.add(Properties.gn_population.toString()+" | d=xsd:long");
- mappings.add(Properties.gn_postalCode.toString());
- mappings.add(Properties.gn_shortName.toString());
- mappings.add(Properties.gn_wikipediaArticle.toString());
- // Altitude is integer meters
- mappings.add(Properties.geo_alt.toString()+" | d=xsd:int");
- // Latitude and Longitude as BigDecimals (xsd:decimal)
- mappings.add(Properties.geo_lat.toString()+" | d=xsd:decimal");
- mappings.add(Properties.geo_long.toString()+" | d=xsd:decimal");
- mappings.add(Properties.rdf_type.toString());
- fieldMappings = mappings.toArray(new String[mappings.size()]);
- }
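-
- // A note on the mapping syntax used above (as interpreted by
- // FieldMappingUtils.parseFieldMapping when the cache configuration is
- // written): a plain field URI activates the field, "fieldA > fieldB"
- // additionally copies values of fieldA to fieldB, and "field | d=xsd:long"
- // converts the values to the given data type. Sketched with prefixes:
- //   gn:name                      -> indexed as is
- //   gn:name > rdfs:label         -> values copied to rdfs:label
- //   gn:population | d=xsd:long   -> values converted to xsd:long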
- Logger log = LoggerFactory.getLogger(GeoNamesIndexer.class);
-
- private Yard yard;
- private ValueFactory vf;
- private boolean indexOntology = false;
- private long startPosition;
- private int indexingChunkSize = 1000;
-
- private File dataDir;
- private File geonamesOntFile;
- private File alternateNamesFile;
- private File hierarchyFile;
- private List<File> adminCodesFiles;
- private File countryInfoFile;
- private final int countryGeonamesIdPos = 17;
- private File geonamesArchiveFile;
- private final String geonamesOntBase = "http://www.geonames.org/ontology/";
- private final String geonamesFeatureBase = "http://sws.geonames.org/";
- private final String geonamesCountryBase = "http://www.geonames.org/countries/";
- //for date processing we use joda time!
- private final Map<Integer,List<FeatureName>> featureNames = new TreeMap<Integer,List<FeatureName>>();
- private final Map<String, Integer> adminCode2featureId = new TreeMap<String, Integer>();
-
- private final Map<Integer,Collection<Integer>> parentFeature = new TreeMap<Integer, Collection<Integer>>();
- private final Map<Integer,Collection<Integer>> adminParentFeature = new TreeMap<Integer, Collection<Integer>>();
-
- private final Map<String, Integer> countryCode2featureId = new TreeMap<String, Integer>();
- /**
- * Key used to parse the Yard used for indexing
- */
- public static final String KEY_YARD = "eu.iksproject.rick.indexing.yard";
- /**
- * Used to parse the ID of the Item to start/resume the indexing
- */
- public static final String KEY_START_INDEX = "eu.iksproject.rick.indexing.startIndex";
- /**
- * Key used to configure if the geonames.org thesaurus should be included in the index.
- */
- public static final String KEY_INDEX_ONTOLOGY_STATE = "eu.iksproject.rick.indexing.geonames.indexOntology";
-
- /**
- * Key used to configure the directory that contains all the data needed
- * for indexing geonames.org
- */
- public static final String KEY_DATA_DIR = "eu.iksproject.rick.indexing.geonames.dataDir";
- /**
- * Key used to parse the name of the zip archive with the geonames.org dump.
- * Typically the allCountries dump.
- */
- public static final String KEY_GEONAMES_ARCHIVE = "eu.iksproject.rick.indexing.geonames.dbdumpArchive";
- /**
- * Key used to parse the name of the file with the country information
- */
- public static final String KEY_COUNTRY_INFOS = "eu.iksproject.rick.indexing.geonames.countryInfoFile";
- /**
- * Key used to parse the name of the file with the admin level1 codes
- */
- public static final String KEY_ADMIN1_CODES = "eu.iksproject.rick.indexing.geonames.admin1CodesFile";
- /**
- * Key used to parse the name of the file with the admin level2 codes
- */
- public static final String KEY_ADMIN2_CODES = "eu.iksproject.rick.indexing.geonames.admin2CodesFile";
- /**
- * Key used to parse the name of the file with the alternate names
- */
- public static final String KEY_ALTERNATE_NAMES = "eu.iksproject.rick.indexing.geonames.alternateNamesFile";
- /**
- * Key used to parse the name of the file with the geonames ontology
- */
- public static final String KEY_GEONAMES_ONTOLOGY = "eu.iksproject.rick.indexing.geonames.geonamesOntologyFile";
-
- public static final String KEY_CHUNK_SIZE = "eu.iksproject.rick.indexing.geonames.chunkSize";
- /**
- * Key used to parse the hierarchy file
- */
- public static final String KEY_HIERARCHY = "eu.iksproject.rick.indexing.geonames.hierarchyFile";
-
- private final static Map<String,Reference> indexDocRefs = new HashMap<String, Reference>();
-
- private static enum Properties{
- rdf_type(NamespaceEnum.rdf.getNamespace(),"type"),
- rdfs_label(NamespaceEnum.rdfs.getNamespace(),"label"),
- dc_creator(NamespaceEnum.dcTerms.getNamespace(),"creator"),
- dc_date(NamespaceEnum.dcTerms.getNamespace(),"date"),
- gn_Feature(NamespaceEnum.geonames.getNamespace(),"Feature"),
- //gn_Country(NamespaceEnum.geonames.getNamespace(),"Country"),
- gn_countryCode(NamespaceEnum.geonames.getNamespace(),"countryCode"),
- //gn_Map(NamespaceEnum.geonames.getNamespace(),"Map"),
- //gn_RDFData(NamespaceEnum.geonames.getNamespace(),"RDFData"),
- //gn_WikipediaArticle(NamespaceEnum.geonames.getNamespace(),"WikipediaArticle"),
- gn_parentFeature(NamespaceEnum.geonames.getNamespace(),"parentFeature"),
- gn_parentCountry(NamespaceEnum.geonames.getNamespace(),"parentCountry"),
- gn_parentADM1(NamespaceEnum.geonames.getNamespace(),"parentADM1"),
- gn_parentADM2(NamespaceEnum.geonames.getNamespace(),"parentADM2"),
- gn_parentADM3(NamespaceEnum.geonames.getNamespace(),"parentADM3"),
- gn_parentADM4(NamespaceEnum.geonames.getNamespace(),"parentADM4"),
- //gn_childrenFeatures(NamespaceEnum.geonames.getNamespace(),"childrenFeatures"),
- //gn_inCountry(NamespaceEnum.geonames.getNamespace(),"inCountry"),
- //gn_locatedIn(NamespaceEnum.geonames.getNamespace(),"locatedIn"),
- //gn_locationMap(NamespaceEnum.geonames.getNamespace(),"locationMap"),
- //gn_nearby(NamespaceEnum.geonames.getNamespace(),"nearby"),
- //gn_nearbyFeatures(NamespaceEnum.geonames.getNamespace(),"nearbyFeatures"),
- //gn_neighbour(NamespaceEnum.geonames.getNamespace(),"neighbour"),
- //gn_neighbouringFeatures(NamespaceEnum.geonames.getNamespace(),"neighbouringFeatures"),
- gn_wikipediaArticle(NamespaceEnum.geonames.getNamespace(),"wikipediaArticle"),
- gn_featureClass(NamespaceEnum.geonames.getNamespace(),"featureClass"),
- gn_featureCode(NamespaceEnum.geonames.getNamespace(),"featureCode"),
- //gn_tag(NamespaceEnum.geonames.getNamespace(),"tag"),
- gn_alternateName(NamespaceEnum.geonames.getNamespace(),"alternateName"),
- gn_officialName(NamespaceEnum.geonames.getNamespace(),"officialName"),
- gn_name(NamespaceEnum.geonames.getNamespace(),"name"),
- gn_population(NamespaceEnum.geonames.getNamespace(),"population"),
- gn_shortName(NamespaceEnum.geonames.getNamespace(),"shortName"),
- gn_postalCode(NamespaceEnum.geonames.getNamespace(),"postalCode"),
- geo_lat(NamespaceEnum.geo.getNamespace(),"lat"),
- geo_long(NamespaceEnum.geo.getNamespace(),"long"),
- geo_alt(NamespaceEnum.geo.getNamespace(),"alt"),
- skos_notation(NamespaceEnum.skos.getNamespace(),"notation"),
- skos_prefLabel(NamespaceEnum.skos.getNamespace(),"prefLabel"),
- skos_altLabel(NamespaceEnum.skos.getNamespace(),"altLabel"),
- skos_hiddenLabel(NamespaceEnum.skos.getNamespace(),"hiddenLabel"),
- skos_note(NamespaceEnum.skos.getNamespace(),"note"),
- skos_changeNote(NamespaceEnum.skos.getNamespace(),"changeNote"),
- skos_definition(NamespaceEnum.skos.getNamespace(),"definition"),
- skos_editorialNote(NamespaceEnum.skos.getNamespace(),"editorialNote"),
- skos_example(NamespaceEnum.skos.getNamespace(),"example"),
- skos_historyNote(NamespaceEnum.skos.getNamespace(),"historyNote"),
- skos_scopeNote(NamespaceEnum.skos.getNamespace(),"scopeNote"),
- skos_broader(NamespaceEnum.skos.getNamespace(),"broader"),
- skos_narrower(NamespaceEnum.skos.getNamespace(),"narrower"),
- skos_related(NamespaceEnum.skos.getNamespace(),"related"),
- ;
- String uri;
- Properties(String namespace,String name){
- uri = namespace+name;
- }
- @Override
- public String toString() {
- return uri;
- }
- }
- public GeoNamesIndexer(Dictionary<String, Object> config) throws IllegalArgumentException {
- this.yard = (Yard)config.get(KEY_YARD);
- if(yard == null){
- throw new IllegalArgumentException("Parsed config MUST CONTAIN a Yard. Use the key "+KEY_YARD+" to parse the YardInstance used to store the geonames.org index!");
- } else {
- log.info(String.format("Using Yard %s (id=%s) to index geonames.org",
- yard.getName(),yard.getId()));
- }
- this.vf = yard.getValueFactory();
- Long startIndex = (Long)config.get(KEY_START_INDEX);
- if(startIndex != null && startIndex > 0l){
- this.startPosition = startIndex;
- } else {
- this.startPosition = 0;
- }
- Integer chunkSize = (Integer)config.get(KEY_CHUNK_SIZE);
- if(chunkSize != null && chunkSize>0){
- this.indexingChunkSize = chunkSize;
- } //else use default value of 1000
- log.info(" ... start indexing at position "+startPosition);
- Boolean indexOntology = (Boolean)config.get(KEY_INDEX_ONTOLOGY_STATE);
- if(indexOntology != null){
- this.indexOntology = indexOntology;
- } else {
- this.indexOntology = false;
- }
- log.info(" ... indexing geonames.org thesaurus="+indexOntology);
- this.dataDir = checkFile(KEY_DATA_DIR, config, "/data");
- this.geonamesArchiveFile = checkFile(KEY_GEONAMES_ARCHIVE, dataDir, config,"allCountries.zip");
- this.countryInfoFile = checkFile(KEY_COUNTRY_INFOS, dataDir,config,"countryInfo.txt");
- this.adminCodesFiles = new ArrayList<File>();
- adminCodesFiles.add(checkFile(KEY_ADMIN1_CODES, dataDir, config,"admin1CodesASCII.txt"));
- adminCodesFiles.add(checkFile(KEY_ADMIN2_CODES, dataDir, config,"admin2Codes.txt"));
- if(this.indexOntology){
- this.geonamesOntFile = checkFile(KEY_GEONAMES_ONTOLOGY, dataDir, config,"ontology_v2.2.1.rdf");
- }
- this.hierarchyFile = checkFile(KEY_HIERARCHY, dataDir, config, "hierarchy.zip");
- this.alternateNamesFile = checkFile(KEY_ALTERNATE_NAMES, dataDir, config,"alternateNames.zip");
- }
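-
- // A minimal usage sketch (hypothetical; assumes a configured Yard
- // instance obtained elsewhere):
- //   Dictionary<String,Object> config = new Hashtable<String,Object>();
- //   config.put(GeoNamesIndexer.KEY_YARD, yard);
- //   config.put(GeoNamesIndexer.KEY_DATA_DIR, "/data");
- //   config.put(GeoNamesIndexer.KEY_CHUNK_SIZE, Integer.valueOf(1000));
- //   new GeoNamesIndexer(config).index();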
- /**
- * Create the index based on the parsed configuration
- * @throws IOException On any error while reading one of the configuration files
- * @throws YardException On any error while storing index features within the Yard
- */
- public void index() throws IOException, YardException{
- readAdminCodes();
- readHierarchy();
- readAlternateNames();
- indexGeonames();
- writeCacheBaseConfiguration();
- }
- /**
- * As the last step we need to create the baseMappings configuration
- * needed to use the Index as a RICK full cache!
- * @throws YardException would be really bad if after successfully indexing
- * about 8 million documents we get an error from the yard at the
- * last possible opportunity :(
- */
- private void writeCacheBaseConfiguration() throws YardException {
- FieldMapper baseMapper = new DefaultFieldMapperImpl(ValueConverterFactory.getInstance(vf));
- log.info("Write BaseMappings for geonames.org Cache");
- log.info(" > Mappings");
- for(String mapping : GeoNamesIndexer.fieldMappings){
- log.info(" - "+mapping);
- baseMapper.addMapping(FieldMappingUtils.parseFieldMapping(mapping));
- }
- CacheUtils.storeBaseMappingsConfiguration(yard, baseMapper);
- log.info(" < completed");
- }
- /**
- * @param config
- */
- private File checkFile(String key,Dictionary<String, Object> config,Object defaultValue) {
- return checkFile(key, null,config, defaultValue);
- }
- private File checkFile(String key,File directory,Dictionary<String, Object> config,Object defaultValue) {
- File testFile;
- Object fileName = config.get(key);
- if(fileName == null){
- if(defaultValue == null){
- throw new IllegalArgumentException("Parsed Config MUST CONTAIN the a reference to the file for key "+key+"!");
- } else {
- fileName = defaultValue;
- }
- }
- if(directory == null){
- testFile = new File(fileName.toString());
- } else {
- testFile = new File(directory,fileName.toString());
- }
- if(!testFile.exists()){
- throw new IllegalStateException("File "+fileName+" parsed by key "+key+" does not exist!");
- }
- if(directory == null && !testFile.isDirectory()){
- throw new IllegalStateException("parsed data directory "+fileName+" exists, but is not a directory!");
- }
- if(directory != null && !testFile.isFile()){
- throw new IllegalStateException("parsed data file "+fileName+" exists, but is not a file!");
- }
- if(!testFile.canRead()){
- throw new IllegalStateException("Unable to read File "+fileName+" parsed for key "+key+"!");
- }
- return testFile;
- }
-
- private void indexGeonames() throws YardException, IOException {
- ZipFile geonamesZipFile;
- try {
- geonamesZipFile = new ZipFile(geonamesArchiveFile);
- } catch (IOException e) {
- //in the init we checked that this is a file, that it exists and that we can read it ...
- // .. so throw a runtime exception here!
- throw new IllegalArgumentException("Unable to access geonames.org DB Dump file",e);
- }
- for(Enumeration<? extends ZipEntry> e = geonamesZipFile.entries();e.hasMoreElements();){
- ZipEntry entry = e.nextElement();
- if(!entry.isDirectory() && !entry.getName().toLowerCase().startsWith("readme")){
- log.info("add Entry "+entry.getName());
- BufferedReader reader = new BufferedReader(new InputStreamReader(geonamesZipFile.getInputStream(entry), Charset.forName("utf-8")));
- String line;
- int pos = 0;
- int blockPos =0;
- List<Representation> currentBlock = new ArrayList<Representation>(indexingChunkSize);
- long start = System.currentTimeMillis();
- long iStart = start;
- while((line = reader.readLine())!=null){
- pos++;
- if(pos>=startPosition){
- try {
- Representation indexedFeature = importFeature(line);
- //log.info(ModelUtils.getRepresentationInfo(indexedFeature));
- blockPos++;
- currentBlock.add(indexedFeature);
- if(blockPos == indexingChunkSize){
- yard.store(currentBlock);
- currentBlock.clear();
- blockPos = 0;
- }
- } catch (RuntimeException e1){
- log.warn("Exception while processing line "+line,e1);
- throw e1;
- } catch (YardException e1){
- log.warn("YardException while processing lines "+(pos-blockPos)+"-"+(pos),e1);
- throw e1;
- }
- if(pos%10000==0){
- long now = System.currentTimeMillis();
- float mean = ((float)(now-start))/(pos-startPosition);
- float iMean = ((float)(now-iStart))/10000;
- log.info(pos+" features processed ("+mean+"ms/feature; "+iMean+"ms/feature for the last 10000 features)");
- iStart=System.currentTimeMillis();
- }
- } else {
- //remove alternate labels from the inMemoryMap for the ID to save memory
- Integer id = new Integer(line.substring(0, line.indexOf('\t')));
- featureNames.remove(id);
- }
- }
- //indexing the remaining documents
- yard.store(currentBlock);
- currentBlock.clear();
- blockPos = 0;
- //the final commit
- long now = System.currentTimeMillis();
- float mean = ((float)(now-start))/(pos-startPosition);
- log.info(pos+" features processed ("+mean+"ms/feature)");
- }
- }
- }
- private Reference getDocRef(String refString){
- Reference ref = indexDocRefs.get(refString);
- if(ref == null){
- ref = yard.getValueFactory().createReference(refString);
- indexDocRefs.put(refString, ref);
- }
- return ref;
- }
- private Collection<Reference> getFeatureReferences(Collection<Integer> ids){
- List<Reference> refs = new ArrayList<Reference>(ids.size());
- for(Integer id : ids){
- if(id != null){
- refs.add(vf.createReference(String.format("%s%s/", geonamesFeatureBase,id)));
- }
- }
- return refs;
- }
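-
- // importFeature parses one tab-separated line of the geonames.org main
- // table. Judging from the tokens consumed below, the expected column
- // order is: geonameId, name (UTF-8), asciiName, alternateNames, latitude,
- // longitude, featureClass, featureCode, countryCode, alternate
- // countryCodes, admin1..admin4 codes, population, elevation, gtopo30,
- // timezone, modificationDate.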
- private Representation importFeature(String line){
- Tokenizer t = new Tokenizer(line);
- String id = t.nextElement();
- Integer geoNamesId = Integer.parseInt(id);
- //create a new Doc based on the first Element (geonamesID)
- Representation doc = this.yard.create(String.format("%s%s/", geonamesFeatureBase,id));
- //add the geonames:Feature type
- doc.add(Properties.rdf_type.toString(), getDocRef(Properties.gn_Feature.toString()));
- //add the UTF-8name
- String utf8Label = t.nextElement();
- doc.addNaturalText(Properties.gn_name.toString(),utf8Label);
- //add the ASCII name as rdfs:label
- String asciiLabel = t.nextElement();
- if(utf8Label == null){
- utf8Label = asciiLabel; //use ASCII label as fallback for the utf8 version
- }
- doc.addNaturalText(Properties.rdfs_label.toString(),utf8Label);
- //alternate Names (alternate names also include airport codes, postal codes and Wikipedia links!)
- t.nextElement(); //consume this Element and use the alternateNames Map instead
- List<FeatureName> alternateNames = featureNames.remove(geoNamesId); //use remove, because we do not need it a 2nd time!
- if(alternateNames != null){
- List<Text> altList = new ArrayList<Text>(alternateNames.size());
- List<Text> officialList = new ArrayList<Text>(alternateNames.size());
- List<String> postalCodes = new ArrayList<String>();
- List<URL> wikipediaLinks = new ArrayList<URL>();
- List<Text> shortNames = new ArrayList<Text>();
- for(FeatureName name : alternateNames){
- if(name.isNaturalLanguageLabel()){
- Text act = vf.createText(name.getName(),name.getLang());
- if(name.isPreferred()){
- officialList.add(act);
- } else {
- altList.add(act);
- }
- if(name.isShortName()){
- shortNames.add(act);
- }
- } else if(name.getLabelType() == NameType.postal){
- postalCodes.add(name.getName());
- } else if(name.getLabelType() == NameType.link){
- if(name.getName().contains("wikipedia.org")){
- try {
- wikipediaLinks.add(new URL(name.getName()));
- } catch (MalformedURLException e) {
- log.warn("Unable to parse URL for link label "+name.getName());
- //ignore
- }
- }
- }
- }
- if(!altList.isEmpty()){
- doc.add(Properties.gn_alternateName.toString(),altList);
- }
- if(!officialList.isEmpty()){
- doc.add(Properties.gn_officialName.toString(),officialList);
- }
- if(!postalCodes.isEmpty()){
- doc.add(Properties.gn_postalCode.toString(), postalCodes);
- }
- if(!wikipediaLinks.isEmpty()){
- doc.add(Properties.gn_wikipediaArticle.toString(), wikipediaLinks);
- }
- if(!shortNames.isEmpty()){
- doc.add(Properties.gn_shortName.toString(), shortNames);
- }
- }
- //lat
- doc.add(Properties.geo_lat.toString(),new BigDecimal(t.nextElement()));
- //lon
- doc.add(Properties.geo_long.toString(),new BigDecimal(t.nextElement()));
- //featureClass
- String featureClass = String.format("%s%s",NamespaceEnum.geonames,t.nextElement());
- doc.add(Properties.gn_featureClass.toString(),getDocRef(featureClass));
- //featureCode (-> need to use <featureClass>.<featureCode>!!)
- doc.add(Properties.gn_featureCode.toString(),getDocRef(String.format("%s.%s",featureClass,t.nextElement())));
- //countryCode
- // -> geonames uses here the link to an HTML Page showing the Country
- // We would like to use a link to a SKOS:Concept representing the Country
- // ... But luckily here we need only to add the URI!
- Set<String> ccs = new HashSet<String>();
- String countryCode = t.nextElement();
- if(countryCode != null){
- countryCode = countryCode.trim(); //need to trim because some country codes use ' ' to indicate null!
- if(countryCode.length() == 2){ //Yes there are some features that are in no country!
- ccs.add(countryCode);
- }
- }
- //alternate countryCodes
- String altCc = t.nextElement();
- if(altCc != null){
- StringTokenizer altCcT = new StringTokenizer(altCc,",");
- while(altCcT.hasMoreElements()){
- countryCode = altCcT.nextToken();
- if(countryCode.length() ==2){
- ccs.add(countryCode);
- }
- }
- }
- if(!ccs.isEmpty()){
- doc.add(Properties.gn_countryCode.toString(),ccs);
- }
- //admin Codes 1-4
- //first read them -> we need to consume the tokens anyway
- String[] adminCodes = new String[] {
- countryCode, //country
- t.nextElement(), //ADM1
- t.nextElement(), //ADM2
- t.nextElement(), //ADM3
- t.nextElement()};//ADM4
- //Workaround for Admin1 -> add leading '0' for single Value
- if(adminCodes[1] != null && adminCodes[1].length() < 2){
- adminCodes[1] = '0'+adminCodes[1];
- }
- addParents(doc,geoNamesId,adminCodes);
-
- //population
- String populationString = t.nextElement();
- if(populationString != null){
- //NOTE: we need to use Long, because of Asia (population ~3.800.000.000 > Integer.MAX_VALUE)
- Long population = new Long(populationString);
- if(population.intValue() > 0){
- doc.add(Properties.gn_population.toString(),population);
- }
- }
- //elevation
- String elevationString = t.nextElement();
- if(elevationString == null){
- elevationString = t.nextElement(); //if no elevation then use the gtopo30 value
- } else {
- t.nextElement(); //if there is already an elevation, then consume the gtopo30 entry
- }
- Integer alt = new Integer(elevationString);
- if(alt.intValue() > -9999){ //it looks like -9999 is sometimes used for 'not known'!
- doc.add(Properties.geo_alt.toString(),alt);
- }
- //time zone
- t.nextElement(); //not used
- //mod-date
- String modDateString = t.nextElement();
- if(modDateString != null){
- try {
- doc.add(Properties.dc_date.toString(),TimeUtils.toDate(DataTypeEnum.DateTime, modDateString));
- }catch (IllegalArgumentException e) {
- log.warn(String.format("Unable to parse modificationDate for geonamesID %s from value %s",doc.getId(),modDateString));
- }
- }
- //and add geonames.org as the creator!
- doc.add(Properties.dc_creator.toString(),"http://www.geonames.org/");
- return doc;
- }
-
- private void addParents(Representation doc,Integer id,String[] adminCodes){
- Integer[] adminIds = new Integer[5];
- //now process the admin Codes (including the country at index 0)
- for(int i=0;i<adminCodes.length;i++){
- if(adminCodes[i] != null && !adminCodes[i].equals("00")){ //00 is used to indicate not known
- StringBuilder parentCode = new StringBuilder();
- for(int j=0;j<i;j++){
- parentCode.append(adminCodes[j]); //add all the previous
- parentCode.append('.'); //add the separator char
- }
- parentCode.append(adminCodes[i]);//add the current (last) Element
- adminIds[i] =adminCode2featureId.get(parentCode.toString()); //might also add null!
- }
- }
- //now get the direct parents
- Map<Integer,Collection<Integer>> parents = new HashMap<Integer, Collection<Integer>>();
- getParents(id,parents);
- //add all parents
- doc.add(Properties.gn_parentFeature.toString(), getFeatureReferences(parents.keySet()));
- //get admin hierarchy
-
- Set<Integer> parentLevel;
- //add country
- if(adminIds[0]!=null){
- doc.add(Properties.gn_parentCountry.toString(), vf.createReference(geonamesFeatureBase+adminIds[0]));
- parentLevel = Collections.singleton(adminIds[0]);
- } else {
- parentLevel = Collections.emptySet();
- }
- //add the admin codes for the 4 levels
- parentLevel = addAdminLevel(doc, Properties.gn_parentADM1, parents, parentLevel, adminIds[1]);
- parentLevel = addAdminLevel(doc, Properties.gn_parentADM2, parents, parentLevel, adminIds[2]);
- parentLevel = addAdminLevel(doc, Properties.gn_parentADM3, parents, parentLevel, adminIds[3]);
- parentLevel = addAdminLevel(doc, Properties.gn_parentADM4, parents, parentLevel, adminIds[4]);
- }
- /**
- * This Method combines the information of <ul>
- * <li> the adminIds originating from the information in the main feature table of geonames
- * <li> hierarchy information originating from the hierarchy table.
- * </ul>
- * into the full admin regions hierarchy.<br>
- * This code would be much simpler if one could trust one of the two data sources.
- * However, first tests have shown that both structures contain some errors!
- * @param doc The doc to add the data
- * @param property the property used for the level
- * @param parents the parent->child mappings for the current geonames feature
- * @param parentLevel the regions of the parent level (should be only one, but sometimes there are more).
- * This data are based on the hierarchy table.
- * @param adminId the region as stored in the geonames main table (only available for level 1 and 2)
- * @return the regions of this level (should be only one, but sometimes there are more)
- */
- private Set<Integer> addAdminLevel(Representation doc,Properties property, Map<Integer,Collection<Integer>> parents,Set<Integer> parentLevel, Integer adminId){
- Set<Integer> currentLevel = new HashSet<Integer>();
- //first add the admin1 originating from the admin info file
- if(adminId!=null){
- currentLevel.add(adminId);
- }
- for(Integer parent : parentLevel){
- //second add the admin1 via the children of the country
- Collection<Integer> tmp = parents.get(parent);
- if(tmp != null){
- currentLevel.addAll(tmp);
- }
- }
- if(!currentLevel.isEmpty()){ //now add all the adm1 we found
- doc.add(property.toString(), getFeatureReferences(currentLevel));
- if(currentLevel.size()>1){ //write warning if there are multiple ids
- log.warn(String.format("Multiple %s for ID %s (ids: %s)",property.name(),doc.getId(),currentLevel.toString()));
- }
- }
- return currentLevel;
- }
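-
- // Hypothetical example for addParents/addAdminLevel above: for a feature
- // with adminCodes = {"AT", "07", null, null, null} the loop builds the
- // lookup keys "AT" (country) and "AT.07" (ADM1); the ids resolved via
- // adminCode2featureId are then merged with the ids from the hierarchy
- // table before being written to gn:parentADM1..gn:parentADM4.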
- /**
- * Recursive method that finds all parents and adds the children of the current
- * node (not all children, but only those of the current tree)
- * @param id the id of the lower level
- * @param parents the set used to add all the parents/child mappings
- */
- private void getParents(Integer id, Map<Integer,Collection<Integer>> parents){
- Collection<Integer> current = parentFeature.get(id);
- if(current != null){
- for(Integer parent : current){
- Collection<Integer> childs = parents.get(parent);
- if(childs == null){
- childs = new HashSet<Integer>();
- parents.put(parent, childs);
- }
- if(childs.add(id)){
- getParents(parent, parents);
- }
- }
- }
- current = adminParentFeature.get(id);
- if(current != null){
- for(Integer parent : current){
- Collection<Integer> childs = parents.get(parent);
- if(childs == null){
- childs = new HashSet<Integer>();
- parents.put(parent, childs);
- }
- if(childs.add(id)){
- getParents(parent, parents);
- }
- }
- }
- }
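-
- // Illustration: getParents walks the child -> parent links upwards, so
- // for a chain district -> region -> country the resulting map is
- // { region=[district], country=[region] }, i.e. each ancestor mapped to
- // the children encountered on the way up.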
-
- private int readCountryInfos() throws IOException{
- BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(countryInfoFile), Charset.forName("utf-8")));
- String line;
- int lineCount = 0;
- while((line = reader.readLine()) != null){
- if(line.indexOf('#')!=0 && line.length()>0){ //# is used as comment
- Tokenizer t = new Tokenizer(line);
- String code = null;
- Integer geonamesId = null;
- int i=1;
- for(;t.hasMoreElements();i++){
- String actToken = t.nextElement();
- if(i==1){
- code = actToken;
- }
- if(i==countryGeonamesIdPos){
- geonamesId = new Integer(actToken);
- break;
- }
- }
- if(i==countryGeonamesIdPos){
- adminCode2featureId.put(code,geonamesId);
- countryCode2featureId.put(code,geonamesId);
- lineCount++;
- } else {
- log.warn("Unable to parse countryInfo from Line "+line);
- }
- }
- }
- reader.close();
- reader = null;
- return lineCount;
- }
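-
- // countryInfo.txt is tab-separated with '#' comment lines; only column 1
- // (the ISO country code) and column 17 (the geonames feature id, see
- // countryGeonamesIdPos) are used here. Hypothetical example line:
- //   AT<TAB>AUT<TAB>...<TAB>2782113<TAB>...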
- /**
- * There are two sources of hierarchy in the geonames.org dumps. <p>
- * First the Admin Region Codes stored in the main table in combination with
- * the CountryInfo and the AdminRegion infos for the first two levels. This
- * uses the ISO country code and the additional number for linking the
- * Regions. Second the Hierarchy table providing parentID, childId, [type]
- * information. This uses featureIDs for linking. <p>
- * This Method reads the first data source into memory. For the country
- * related information it calls {@link #readCountryInfos()}.
- * @throws IOException
- */
- private void readAdminCodes() throws IOException{
- long start = System.currentTimeMillis();
- //first read adminCodes based on the countryInfos
- int lineCount = readCountryInfos();
- for(File adminCodeFile : adminCodesFiles){
- BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(adminCodeFile), Charset.forName("utf-8")));
- String line;
- while((line = reader.readLine()) != null){
- if(line.indexOf('#')!=0 && line.length()>0){ //# is used as comment
- lineCount++;
- //no tokenizer this time ... need only first and last column!
- String code = line.substring(0, line.indexOf('\t'));
- Integer geonamesId = new Integer(line.substring(line.lastIndexOf('\t')+1));
- adminCode2featureId.put(code, geonamesId);
- }
- }
- reader.close();
- reader = null;
- }
- log.info("read "+lineCount+" AdminCodes in "+(System.currentTimeMillis()-start)+"ms");
- }
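-
- // The admin code files are tab-separated as well; only the first column
- // (the code, e.g. "AT.07") and the last column (the geonames feature id)
- // are read above. Hypothetical example line:
- //   AT.07<TAB>Tyrol<TAB>Tyrol<TAB>2763586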
- /**
- * This Method loads the alternate labels of geonames.org. Such labels are
- * used for multiple language support but also include postal codes, links
- * to wikipedia, airport codes ... see {@link NameType} for details.
- * TODO: This loads a lot of data into memory. Maybe one should consider
- * using a caching framework like OSCache. Features are sorted by country
- * anyway, so frequently used labels would stay in memory while labels that
- * are only used once could be serialised to the cache in low-memory
- * environments!
- * @throws IOException
- */
- private void readAlternateNames() throws IOException{
- BufferedReader reader;
- if(alternateNamesFile.getName().endsWith(".zip")){
- ZipFile alternateNamesArchive;
- try {
- alternateNamesArchive = new ZipFile(alternateNamesFile);
- } catch (IOException e) {
- //in the init we checked that this is a file, that it exists and that we can read it ...
- // .. so throw a runtime exception here!
- throw new IllegalArgumentException("Unable to access geonames.org DB Dump file",e);
- }
- Enumeration<? extends ZipEntry> e = alternateNamesArchive.entries();
- ZipEntry entry = null;
- while(e.hasMoreElements()){
- ZipEntry cur = e.nextElement();
- if(!cur.isDirectory() && cur.getName().equalsIgnoreCase("alternatenames.txt")){
- entry = cur;
- break;
- }
- }
- if(entry ==null){
- throw new IllegalStateException("Archive with alternate Names does not contain the \"alternateNames.txt\" file!");
- } else {
- log.info("read alternate names from Archive Entry "+entry.getName());
- reader = new BufferedReader(new InputStreamReader(alternateNamesArchive.getInputStream(entry), Charset.forName("utf-8")));
- }
- } else {
- reader = new BufferedReader(new InputStreamReader(new FileInputStream(alternateNamesFile), Charset.forName("utf-8")));
- }
- FeatureName name;
- int lineCount = 0;
- EnumMap<NameType, int[]> labelTypeCounts = new EnumMap<NameType, int[]>(NameType.class);
- for(NameType entry :NameType.values()){
- labelTypeCounts.put(entry, new int[]{0});
- }
- String line;
- long start = System.currentTimeMillis();
- while((line = reader.readLine()) != null){
- try {
- name = new FeatureName(line);
- } catch (RuntimeException e) {
- log.warn("Unable to parse Featurname for line: "+line,e);
- continue;
- }
- List<FeatureName> names = featureNames.get(name.geonameID);
- if(names == null){
- names = new ArrayList<FeatureName>();
- featureNames.put(name.geonameID, names);
- }
- if(name.isPreferred()){
- names.add(0, name);
- } else {
- names.add(name);
- }
- lineCount++;
- labelTypeCounts.get(name.getLabelType())[0]++; //increase the count for this type!
- if(log.isDebugEnabled() && lineCount%10000==0){
- log.debug("processed "+lineCount+" labels");
- }
- }
- log.info("read "+lineCount+" alternate Names for "+featureNames.size()+" Features in "+(System.currentTimeMillis()-start)+"ms");
- for(Entry<NameType, int[]> count : labelTypeCounts.entrySet()){
- log.info(" "+count.getKey().toString()+": "+count.getValue()[0]);
- }
- }
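-
- // Each line of alternateNames.txt is parsed into a FeatureName (see that
- // class for the column handling). Preferred names are inserted at index 0
- // of the per-feature list so importFeature encounters them first; postal
- // codes, links and other non natural-language entries are kept too and
- // later dispatched by their NameType.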
- /**
- * There are two sources of hierarchy in the geonames.org dumps. <p>
- * First the Admin Region Codes stored in the main table in combination with
- * the CountryInfo and the AdminRegion infos for the first two levels. This
- * uses the ISO country code and the additional number for linking the
- * Regions. Second the Hierarchy table providing parentID, childId, [type]
- * information. This uses featureIDs for linking. <p>
- * This Method processes the second datasource and stores the child ->
- * parents mappings in memory. Administrative hierarchies are stored in a
- * different map. Note that also for administrative regions there are
- * some cases where a child has more than one parent.
- * @throws IOException
- */
- private void readHierarchy() throws IOException{
- BufferedReader reader;
- if(hierarchyFile.getName().endsWith(".zip")){
- ZipFile hierarchyArchive;
- try {
- hierarchyArchive = new ZipFile(hierarchyFile);
- } catch (IOException e) {
- //in the init we checked that this is a file, that it exists and that we can read it ...
- // .. so throw a runtime exception here!
- throw new IllegalArgumentException("Unable to access geonames.org DB Dump hierarchy file",e);
- }
- Enumeration<? extends ZipEntry> e = hierarchyArchive.entries();
- ZipEntry entry = null;
- while(e.hasMoreElements()){
- ZipEntry cur = e.nextElement();
- if(!cur.isDirectory() && cur.getName().equalsIgnoreCase("hierarchy.txt")){
- entry = cur;
- break;
- }
- }
- if(entry ==null){
- throw new IllegalStateException("Archive with alternate Names does not contain the \"alternateNames.txt\" file!");
- } else {
- log.info("read hierarchy data fromArchive Entry "+entry.getName());
- reader = new BufferedReader(new InputStreamReader(hierarchyArchive.getInputStream(entry), Charset.forName("utf-8")));
- }
- } else {
- reader = new BufferedReader(new InputStreamReader(new FileInputStream(hierarchyFile), Charset.forName("utf-8")));
- }
- String line;
- int lineCount=0;
- long start = System.currentTimeMillis();
- while((line = reader.readLine()) != null){
- lineCount++;
- Tokenizer t = new Tokenizer(line);
- Integer parent = new Integer(t.nextElement());
- Integer child = new Integer(t.nextElement());
- String type;
- if(t.hasMoreElements()){
- type = t.nextElement();
- } else {
- type = null;
- }
- if("ADM".equals(type)){
- Collection<Integer> parents = adminParentFeature.get(child);
- if(parents == null){
- parents = new ArrayList<Integer>(1); //there are only some exceptions with multiple parents
- adminParentFeature.put(child, parents);
- }
- parents.add(parent);
- } else {
- Collection<Integer> parents = parentFeature.get(child);
- if(parents == null){
- parents = new ArrayList<Integer>(3);
- parentFeature.put(child, parents);
- }
- parents.add(parent);
- }
- }
- log.info(String.format("read %d hierarchy relations in %dms",lineCount,System.currentTimeMillis()-start));
- }
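-
- // hierarchy.txt lines have the form parentId<TAB>childId[<TAB>type]; only
- // the type "ADM" is treated specially (stored in adminParentFeature), all
- // other relations go to the generic parentFeature map. Hypothetical
- // example: "2782113<TAB>2763586<TAB>ADM" would place Tyrol below Austria.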
-
-// private static void indexGeonamesOntology() throws RepositoryException, RDFParseException, IOException, SemanticSearchProviderException{
-// Repository ontRepository = new SailRepository(new MemoryStore());
-// ontRepository.initialize();
-// RepositoryConnection con = ontRepository.getConnection();
-// File geonamesOnt = new File(GeoNamesIndexer.geonamesOntFile);
-// System.out.println("Geonames Ontology: ");
-// System.out.println(" > name : "+geonamesOnt.getAbsolutePath());
-// System.out.println(" > exists : "+geonamesOnt.exists());
-// System.out.println(" > isFile : "+geonamesOnt.isFile());
-// //add the geonames Ont to the Repository
-// con.add(geonamesOnt, geonamesOntBase, RDFFormat.RDFXML);
-// RepositoryResult<Statement> results = con.getStatements(null, org.openrdf.model.vocabulary.RDF.TYPE, null, false);
-// Map<Resource,IndexInputDocument> geonamesOntResources = new HashMap<Resource,IndexInputDocument>();
-// log.info("Process Ontology:");
-// for(Statement stm: results.asList()){
-// log.debug(" Statement : "+stm.getSubject());
-// //check for contains to avoid multiple processing if a resource has two types
-// if(!geonamesOntResources.containsKey(stm.getSubject())){
-// log.info(" > "+stm.getSubject());
-// geonamesOntResources.put(stm.getSubject(), getResourceValues(manager.getPathRegistry(), con, stm));
-// }
-// }
-// log.info("Index Geonames Ontology ("+geonamesOntResources.size()+" Resources)");
-// manager.getIndexProvider().indexDocuments(geonamesOntResources.values());
-// con.close();
-// con = null;
+
+ public static final String[] fieldMappings;
+ static {
+ ArrayList<String> mappings = new ArrayList<String>();
+ mappings.add(Properties.gn_name.toString());
+ //While indexing I use the UTF-8 name as RDFS label (ASCII as fallback).
+ //This should also be the case for updated documents
+ mappings.add(Properties.gn_name.toString()+" > "+Properties.rdfs_label.toString());
+ mappings.add(Properties.gn_alternateName.toString());
+ mappings.add(Properties.gn_countryCode.toString());
+ mappings.add(Properties.gn_featureClass.toString());
+ mappings.add(Properties.gn_featureCode.toString());
+ mappings.add(Properties.gn_officialName.toString());
+ //This cache copies the values of the sub-properties of parentFeature
+ //to the super property. So we need to write the according mappings
+ mappings.add(Properties.gn_parentADM1.toString());
+ mappings.add(Properties.gn_parentADM1.toString()+" > "+Properties.gn_parentFeature.toString());
+ mappings.add(Properties.gn_parentADM2.toString());
+ mappings.add(Properties.gn_parentADM2.toString()+" > "+Properties.gn_parentFeature.toString());
+ mappings.add(Properties.gn_parentADM3.toString());
+ mappings.add(Properties.gn_parentADM3.toString()+" > "+Properties.gn_parentFeature.toString());
+ mappings.add(Properties.gn_parentADM4.toString());
+ mappings.add(Properties.gn_parentADM4.toString()+" > "+Properties.gn_parentFeature.toString());
+ mappings.add(Properties.gn_parentCountry.toString());
+ mappings.add(Properties.gn_parentCountry.toString()+" > "+Properties.gn_parentFeature.toString());
+ mappings.add(Properties.gn_parentFeature.toString());
+ //population is converted to long (NOTE: population of Asia > Integer.MAX_VALUE)
+ mappings.add(Properties.gn_population.toString()+" | d=xsd:long");
+ mappings.add(Properties.gn_postalCode.toString());
+ mappings.add(Properties.gn_shortName.toString());
+ mappings.add(Properties.gn_wikipediaArticle.toString());
+ // Altitude is integer meters
+ mappings.add(Properties.geo_alt.toString()+" | d=xsd:int");
+ // Latitude and Longitude as BigDecimals (xsd:decimal)
+ mappings.add(Properties.geo_lat.toString()+" | d=xsd:decimal");
+ mappings.add(Properties.geo_long.toString()+" | d=xsd:decimal");
+ mappings.add(Properties.rdf_type.toString());
+ fieldMappings = mappings.toArray(new String[mappings.size()]);
+ }
+ Logger log = LoggerFactory.getLogger(GeoNamesIndexer.class);
+
+ private Yard yard;
+ private ValueFactory vf;
+ private boolean indexOntology = false;
+ private long startPosition;
+ private int indexingChunkSize = 1000;
+
+ private File dataDir;
+ private File geonamesOntFile;
+ private File alternateNamesFile;
+ private File hierarchyFile;
+ private List<File> adminCodesFiles;
+ private File countryInfoFile;
+ private final int countryGeonamesIdPos = 17;
+ private File geonamesArchiveFile;
+ private final String geonamesOntBase = "http://www.geonames.org/ontology/";
+ private final String geonamesFeatureBase = "http://sws.geonames.org/";
+ private final String geonamesCountryBase = "http://www.geonames.org/countries/";
+ //for date processing we use joda time!
+ private final Map<Integer,List<FeatureName>> featureNames = new TreeMap<Integer,List<FeatureName>>();
+ private final Map<String, Integer> adminCode2featureId = new TreeMap<String, Integer>();
+
+ private final Map<Integer,Collection<Integer>> parentFeature = new TreeMap<Integer, Collection<Integer>>();
+ private final Map<Integer,Collection<Integer>> adminParentFeature = new TreeMap<Integer, Collection<Integer>>();
+
+ private final Map<String, Integer> countryCode2featureId = new TreeMap<String, Integer>();
+ /**
+ * Key used to parse the Yard used for indexing
+ */
+ public static final String KEY_YARD = "eu.iksproject.rick.indexing.yard";
+ /**
+ * Used to parse the ID of the Item to start/resume the indexing
+ */
+ public static final String KEY_START_INDEX = "eu.iksproject.rick.indexing.startIndex";
+ /**
+ * Key used to configure if the geonames.org thesaurus should be included in the index.
+ */
+ public static final String KEY_INDEX_ONTOLOGY_STATE = "eu.iksproject.rick.indexing.geonames.indexOntology";
+
+ /**
+ * Key used to configure the directory that contains all the data needed
+ * for indexing geonames.org
+ */
+ public static final String KEY_DATA_DIR = "eu.iksproject.rick.indexing.geonames.dataDir";
+ /**
+ * Key used to parse the name of the zip archive with the geonames.org dump.
+ * Typically the allCountries dump.
+ */
+ public static final String KEY_GEONAMES_ARCHIVE = "eu.iksproject.rick.indexing.geonames.dbdumpArchive";
+ /**
+ * Key used to parse the name of the file with the country information
+ */
+ public static final String KEY_COUNTRY_INFOS = "eu.iksproject.rick.indexing.geonames.countryInfoFile";
+ /**
+ * Key used to parse the name of the file with the admin level1 codes
+ */
+ public static final String KEY_ADMIN1_CODES = "eu.iksproject.rick.indexing.geonames.admin1CodesFile";
+ /**
+ * Key used to parse the name of the file with the admin level2 codes
+ */
+ public static final String KEY_ADMIN2_CODES = "eu.iksproject.rick.indexing.geonames.admin2CodesFile";
+ /**
+ * Key used to parse the name of the file with the alternate names
+ */
+ public static final String KEY_ALTERNATE_NAMES = "eu.iksproject.rick.indexing.geonames.alternateNamesFile";
+ /**
+ * Key used to parse the name of the file with the geonames ontology
+ */
+ public static final String KEY_GEONAMES_ONTOLOGY = "eu.iksproject.rick.indexing.geonames.geonamesOntologyFile";
+
+ public static final String KEY_CHUNK_SIZE = "eu.iksproject.rick.indexing.geonames.chunkSize";
+ /**
+ * Key used to parse the hierarchy file
+ */
+ public static final String KEY_HIERARCHY = "eu.iksproject.rick.indexing.geonames.hierarchyFile";
+
+ private final static Map<String,Reference> indexDocRefs = new HashMap<String, Reference>();
+
+ private static enum Properties{
+ rdf_type(NamespaceEnum.rdf.getNamespace(),"type"),
+ rdfs_label(NamespaceEnum.rdfs.getNamespace(),"label"),
+ dc_creator(NamespaceEnum.dcTerms.getNamespace(),"creator"),
+ dc_date(NamespaceEnum.dcTerms.getNamespace(),"date"),
+ gn_Feature(NamespaceEnum.geonames.getNamespace(),"Feature"),
+ //gn_Country(NamespaceEnum.geonames.getNamespace(),"Country"),
+ gn_countryCode(NamespaceEnum.geonames.getNamespace(),"countryCode"),
+ //gn_Map(NamespaceEnum.geonames.getNamespace(),"Map"),
+ //gn_RDFData(NamespaceEnum.geonames.getNamespace(),"RDFData"),
+ //gn_WikipediaArticle(NamespaceEnum.geonames.getNamespace(),"WikipediaArticle"),
+ gn_parentFeature(NamespaceEnum.geonames.getNamespace(),"parentFeature"),
+ gn_parentCountry(NamespaceEnum.geonames.getNamespace(),"parentCountry"),
+ gn_parentADM1(NamespaceEnum.geonames.getNamespace(),"parentADM1"),
+ gn_parentADM2(NamespaceEnum.geonames.getNamespace(),"parentADM2"),
+ gn_parentADM3(NamespaceEnum.geonames.getNamespace(),"parentADM3"),
+ gn_parentADM4(NamespaceEnum.geonames.getNamespace(),"parentADM4"),
+ //gn_childrenFeatures(NamespaceEnum.geonames.getNamespace(),"childrenFeatures"),
+ //gn_inCountry(NamespaceEnum.geonames.getNamespace(),"inCountry"),
+ //gn_locatedIn(NamespaceEnum.geonames.getNamespace(),"locatedIn"),
+ //gn_locationMap(NamespaceEnum.geonames.getNamespace(),"locationMap"),
+ //gn_nearby(NamespaceEnum.geonames.getNamespace(),"nearby"),
+ //gn_nearbyFeatures(NamespaceEnum.geonames.getNamespace(),"nearbyFeatures"),
+ //gn_neighbour(NamespaceEnum.geonames.getNamespace(),"neighbour"),
+ //gn_neighbouringFeatures(NamespaceEnum.geonames.getNamespace(),"neighbouringFeatures"),
+ gn_wikipediaArticle(NamespaceEnum.geonames.getNamespace(),"wikipediaArticle"),
+ gn_featureClass(NamespaceEnum.geonames.getNamespace(),"featureClass"),
+ gn_featureCode(NamespaceEnum.geonames.getNamespace(),"featureCode"),
+ //gn_tag(NamespaceEnum.geonames.getNamespace(),"tag"),
+ gn_alternateName(NamespaceEnum.geonames.getNamespace(),"alternateName"),
+ gn_officialName(NamespaceEnum.geonames.getNamespace(),"officialName"),
+ gn_name(NamespaceEnum.geonames.getNamespace(),"name"),
+ gn_population(NamespaceEnum.geonames.getNamespace(),"population"),
+ gn_shortName(NamespaceEnum.geonames.getNamespace(),"shortName"),
+ gn_postalCode(NamespaceEnum.geonames.getNamespace(),"postalCode"),
+ geo_lat(NamespaceEnum.geo.getNamespace(),"lat"),
+ geo_long(NamespaceEnum.geo.getNamespace(),"long"),
+ geo_alt(NamespaceEnum.geo.getNamespace(),"alt"),
+ skos_notation(NamespaceEnum.skos.getNamespace(),"notation"),
+ skos_prefLabel(NamespaceEnum.skos.getNamespace(),"prefLabel"),
+ skos_altLabel(NamespaceEnum.skos.getNamespace(),"altLabel"),
+ skos_hiddenLabel(NamespaceEnum.skos.getNamespace(),"hiddenLabel"),
+ skos_note(NamespaceEnum.skos.getNamespace(),"note"),
+ skos_changeNote(NamespaceEnum.skos.getNamespace(),"changeNote"),
+ skos_definition(NamespaceEnum.skos.getNamespace(),"definition"),
+ skos_editorialNote(NamespaceEnum.skos.getNamespace(),"editorialNote"),
+ skos_example(NamespaceEnum.skos.getNamespace(),"example"),
+ skos_historyNote(NamespaceEnum.skos.getNamespace(),"historyNote"),
+ skos_scopeNote(NamespaceEnum.skos.getNamespace(),"scopeNote"),
+ skos_broader(NamespaceEnum.skos.getNamespace(),"broader"),
+ skos_narrower(NamespaceEnum.skos.getNamespace(),"narrower"),
+ skos_related(NamespaceEnum.skos.getNamespace(),"related"),
+ ;
+ String uri;
+ Properties(String namespace,String name){
+ uri = namespace+name;
+ }
+ @Override
+ public String toString() {
+ return uri;
+ }
+ }
+ public GeoNamesIndexer(Dictionary<String, Object> config) throws IllegalArgumentException {
+ this.yard = (Yard)config.get(KEY_YARD);
+ if(yard == null){
+ throw new IllegalArgumentException("Parsed config MUST CONTAIN a Yard. Use the key "+KEY_YARD+" to parse the YardInstance used to store the geonames.org index!");
+ } else {
+ log.info(String.format("Using Yard %s (id=%s) to index geonames.org",
+ yard.getName(),yard.getId()));
+ }
+ this.vf = yard.getValueFactory();
+ Long startIndex = (Long)config.get(KEY_START_INDEX);
+ if(startIndex != null && startIndex > 0l){
+ this.startPosition = startIndex;
+ } else {
+ this.startPosition = 0;
+ }
+ Integer chunkSize = (Integer)config.get(KEY_CHUNK_SIZE);
+ if(chunkSize != null && chunkSize>0){
+ this.indexingChunkSize = chunkSize;
+ } //else use default value of 1000
+ log.info(" ... start indexing at position "+startPosition);
+ Boolean indexOntology = (Boolean)config.get(KEY_INDEX_ONTOLOGY_STATE);
+ if(indexOntology != null){
+ this.indexOntology = indexOntology;
+ } else {
+ this.indexOntology = false;
+ }
+ log.info(" ... indexing geonames.org thesaurus="+indexOntology);
+ this.dataDir = checkFile(KEY_DATA_DIR, config, "/data");
+ this.geonamesArchiveFile = checkFile(KEY_GEONAMES_ARCHIVE, dataDir, config,"allCountries.zip");
+ this.countryInfoFile = checkFile(KEY_COUNTRY_INFOS, dataDir,config,"countryInfo.txt");
+ this.adminCodesFiles = new ArrayList<File>();
+ adminCodesFiles.add(checkFile(KEY_ADMIN1_CODES, dataDir, config,"admin1CodesASCII.txt"));
+ adminCodesFiles.add(checkFile(KEY_ADMIN2_CODES, dataDir, config,"admin2Codes.txt"));
+ if(this.indexOntology){
+ this.geonamesOntFile = checkFile(KEY_GEONAMES_ONTOLOGY, dataDir, config,"ontology_v2.2.1.rdf");
+ }
+ this.hierarchyFile = checkFile(KEY_HIERARCHY, dataDir, config, "hierarchy.zip");
+ this.alternateNamesFile = checkFile(KEY_ALTERNATE_NAMES, dataDir, config,"alternateNames.zip");
+ }
+ /**
+ * Create the index based on the parsed configuration
+ * @throws IOException On any error while reading one of the configuration files
+ * @throws YardException On any error while storing index features within the Yard
+ */
+ public void index() throws IOException, YardException{
+ readAdminCodes();
+ readHierarchy();
+ readAlternateNames();
+ indexGeonames();
+ writeCacheBaseConfiguration();
+ }
+ /**
+ * As the last step we need to create the baseMappings configuration
+ * needed to use the Index as a RICK full cache!
+ * @throws YardException would be really bad if after successfully indexing
+ * about 8 million documents we get an error from the yard at the
+ * last possible opportunity :(
+ */
+ private void writeCacheBaseConfiguration() throws YardException {
+ FieldMapper baseMapper = new DefaultFieldMapperImpl(ValueConverterFactory.getInstance(vf));
+ log.info("Write BaseMappings for geonames.org Cache");
+ log.info(" > Mappings");
+ for(String mapping : GeoNamesIndexer.fieldMappings){
+ log.info(" - "+mapping);
+ baseMapper.addMapping(FieldMappingUtils.parseFieldMapping(mapping));
+ }
+ CacheUtils.storeBaseMappingsConfiguration(yard, baseMapper);
+ log.info(" < completed");
+ }
+ /**
+ * @param config
+ */
+ private File checkFile(String key,Dictionary<String, Object> config,Object defaultValue) {
+ return checkFile(key, null,config, defaultValue);
+ }
+ private File checkFile(String key,File directory,Dictionary<String, Object> config,Object defaultValue) {
+ File testFile;
+ Object fileName = config.get(key);
+ if(fileName == null){
+ if(defaultValue == null){
+ throw new IllegalArgumentException("Parsed Config MUST CONTAIN the a reference to the file for key "+key+"!");
+ } else {
+ fileName = defaultValue;
+ }
+ }
+ if(directory == null){
+ testFile = new File(fileName.toString());
+ } else {
+ testFile = new File(directory,fileName.toString());
+ }
+ if(!testFile.exists()){
+ throw new IllegalStateException("File "+fileName+" parsed by key "+key+" does not exist!");
+ }
+ if(directory == null && !testFile.isDirectory()){
+ throw new IllegalStateException("parsed data directory "+fileName+" exists, but is not a directory!");
+ }
+ if(directory != null && !testFile.isFile()){
+ throw new IllegalStateException("parsed data file "+fileName+" exists, but is not a file!");
+ }
+ if(!testFile.canRead()){
+ throw new IllegalStateException("Unable to read File "+fileName+" parsed for key "+key+"!");
+ }
+ return testFile;
+ }
+
+ private void indexGeonames() throws YardException, IOException {
+ ZipFile geonamesZipFile;
+ try {
+ geonamesZipFile = new ZipFile(geonamesArchiveFile);
+ } catch (IOException e) {
+ //in the init we checked that this is a file, that it exists and that we can read it ...
+ // .. so throw a runtime exception here!
+ throw new IllegalArgumentException("Unable to access geonames.org DB Dump file",e);
+ }
+ for(Enumeration<? extends ZipEntry> e = geonamesZipFile.entries();e.hasMoreElements();){
+ ZipEntry entry = e.nextElement();
+ if(!entry.isDirectory() && !entry.getName().toLowerCase().startsWith("readme")){
+ log.info("add Entry "+entry.getName());
+ BufferedReader reader = new BufferedReader(new InputStreamReader(geonamesZipFile.getInputStream(entry), Charset.forName("utf-8")));
+ String line;
+ int pos = 0;
+ int blockPos =0;
+ List<Representation> currentBlock = new ArrayList<Representation>(indexingChunkSize);
+ long start = System.currentTimeMillis();
+ long iStart = start;
+ while((line = reader.readLine())!=null){
+ pos++;
+ if(pos>=startPosition){
+ try {
+ Representation indexedFeature = importFeature(line);
+ //log.info(ModelUtils.getRepresentationInfo(indexedFeature));
+ blockPos++;
+ currentBlock.add(indexedFeature);
+ if(blockPos == indexingChunkSize){
+ yard.store(currentBlock);
+ currentBlock.clear();
+ blockPos = 0;
+ }
+ } catch (RuntimeException e1){
+ log.warn("Exception while processing line "+line,e1);
+ throw e1;
+ } catch (YardException e1){
+ log.warn("YardException while processing lines "+(pos-blockPos)+"-"+(pos),e1);
+ throw e1;
+ }
+ if(pos%10000==0){
+ long now = System.currentTimeMillis();
+ float mean = ((float)(now-start))/(pos-startPosition);
+ float iMean = ((float)(now-iStart))/10000;
+ log.info(pos+" features processed ("+mean+"ms/feature; "+iMean+"ms/feature for the last 10000 features)");
+ iStart=System.currentTimeMillis();
+ }
+ } else {
+ //remove alternate labels from the inMemoryMap for the ID to save memory
+ Integer id = new Integer(line.substring(0, line.indexOf('\t')));
+ featureNames.remove(id);
+ }
+ }
+ //indexing the remaining documents
+ yard.store(currentBlock);
+ currentBlock.clear();
+ blockPos = 0;
+ //the final commit
+ long now = System.currentTimeMillis();
+ float mean = ((float)(now-start))/(pos-startPosition);
+ log.info(pos+" features processed ("+mean+"ms/feature)");
+ }
+ }
+ }
+ private Reference getDocRef(String refString){
+ Reference ref = indexDocRefs.get(refString);
+ if(ref == null){
+ ref = yard.getValueFactory().createReference(refString);
+ indexDocRefs.put(refString, ref);
+ }
+ return ref;
+ }
+ private Collection<Reference> getFeatureReferences(Collection<Integer> ids){
+ List<Reference> refs = new ArrayList<Reference>(ids.size());
+ for(Integer id : ids){
+ if(id != null){
+ refs.add(vf.createReference(String.format("%s%s/", geonamesFeatureBase,id)));
+ }
+ }
+ return refs;
+ }
+ private Representation importFeature(String line){
+ Tokenizer t = new Tokenizer(line);
+ String id = t.nextElement();
+ Integer geoNamesId = Integer.parseInt(id);
+ //create a new Doc based on the first Element (geonamesID)
+ Representation doc = this.yard.create(String.format("%s%s/", geonamesFeatureBase,id));
+ //add the geonames:Feature type
+ doc.add(Properties.rdf_type.toString(), getDocRef(Properties.gn_Feature.toString()));
+ //add the UTF-8name
+ String utf8Label = t.nextElement();
+ doc.addNaturalText(Properties.gn_name.toString(),utf8Label);
+ //add the ASCII name as rdfs:label
+ String asciiLabel = t.nextElement();
+ if(utf8Label == null){
+ utf8Label = asciiLabel; //use ASCII label as fallback for the utf8 version
+ }
+ doc.addNaturalText(Properties.rdfs_label.toString(),utf8Label);
+ //alternate Names (alternate names also include airport codes, postal codes and Wikipedia links!)
+ t.nextElement(); //consume this Element and use the alternateNames Map instead
+ List<FeatureName> alternateNames = featureNames.remove(geoNamesId); //use remove, because we do not need it a 2nd time!
+ if(alternateNames != null){
+ List<Text> altList = new ArrayList<Text>(alternateNames.size());
+ List<Text> officialList = new ArrayList<Text>(alternateNames.size());
+ List<String> postalCodes = new ArrayList<String>();
+ List<URL> wikipediaLinks = new ArrayList<URL>();
+ List<Text> shortNames = new ArrayList<Text>();
+ for(FeatureName name : alternateNames){
+ if(name.isNaturalLanguageLabel()){
+ Text act = vf.createText(name.getName(),name.getLang());
+ if(name.isPreferred()){
+ officialList.add(act);
+ } else {
+ altList.add(act);
+ }
+ if(name.isShortName()){
+ shortNames.add(act);
+ }
+ } else if(name.getLabelType() == NameType.postal){
+ postalCodes.add(name.getName());
+ } else if(name.getLabelType() == NameType.link){
+ if(name.getName().contains("wikipedia.org")){
+ try {
+ wikipediaLinks.add(new URL(name.getName()));
+ } catch (MalformedURLException e) {
+ log.warn("Unable to parse URL for link label "+name.getName());
+ //ignore
+ }
+ }
+ }
+ }
+ if(!altList.isEmpty()){
+ doc.add(Properties.gn_alternateName.toString(),altList);
+ }
+ if(!officialList.isEmpty()){
+ doc.add(Properties.gn_officialName.toString(),officialList);
+ }
+ if(!postalCodes.isEmpty()){
+ doc.add(Properties.gn_postalCode.toString(), postalCodes);
+ }
+ if(!wikipediaLinks.isEmpty()){
+ doc.add(Properties.gn_wikipediaArticle.toString(), wikipediaLinks);
+ }
+ if(!shortNames.isEmpty()){
+ doc.add(Properties.gn_shortName.toString(), shortNames);
+ }
+ }
+ //lat
+ doc.add(Properties.geo_lat.toString(),new BigDecimal(t.nextElement()));
+ //lon
+ doc.add(Properties.geo_long.toString(),new BigDecimal(t.nextElement()));
+ //featureClass
+ String featureClass = String.format("%s%s",NamespaceEnum.geonames,t.nextElement());
+ doc.add(Properties.gn_featureClass.toString(),getDocRef(featureClass));
+ //featureCode (-> need to use <featureClass>.<featureCode>!!)
+ doc.add(Properties.gn_featureCode.toString(),getDocRef(String.format("%s.%s",featureClass,t.nextElement())));
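+ //For illustration (tokens assumed, not taken from the dump): a feature
+ //class token "P" and a code token "PPL" would yield <geonames-ns>P as
+ //gn:featureClass and <geonames-ns>P.PPL as gn:featureCode.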
+ //countryCode
+ // -> geonames links here to an HTML page showing the country.
+ // We would prefer a link to a SKOS concept representing the country
+ // ... but luckily here we only need to store the code itself!
+ Set<String> ccs = new HashSet<String>();
+ String countryCode = t.nextElement();
+ if(countryCode != null){
+ countryCode = countryCode.trim(); //need to trim, because some country codes use ' ' to indicate a missing value!
+ if(countryCode.length() == 2){ //yes, there are some features that belong to no country!
+ ccs.add(countryCode);
+ }
+ }
+ //alternate countryCodes
+ String altCc = t.nextElement();
+ if(altCc != null){
+ StringTokenizer altCcT = new StringTokenizer(altCc,",");
+ while(altCcT.hasMoreElements()){
+ countryCode = altCcT.nextToken();
+ if(countryCode.length() ==2){
+ ccs.add(countryCode);
+ }
+ }
+ }
+ if(!ccs.isEmpty()){
+ doc.add(Properties.gn_countryCode.toString(),ccs);
+ }
+ //admin Codes 1-4
+ //first read them -> we need to consume the tokens anyway
+ String[] adminCodes = new String[] {
+ countryCode, //country
+ t.nextElement(), //ADM1
+ t.nextElement(), //ADM2
+ t.nextElement(), //ADM3
+ t.nextElement()};//ADM4
+ //Workaround for Admin1 -> add a leading '0' to single-character values
+ if(adminCodes[1] != null && adminCodes[1].length() < 2){
+ adminCodes[1] = '0'+adminCodes[1];
+ }
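+ //For illustration (codes assumed): a raw ADM1 code "5" becomes "05",
+ //so that the combined key built in addParents (e.g. "AT.05") matches
+ //the keys read from the admin code files.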
+ addParents(doc,geoNamesId,adminCodes);
+
+ //population
+ String populationString = t.nextElement();
+ if(populationString != null){
+ //NOTE: we need to use Long, because the population of Asia (~3,800,000,000) exceeds Integer.MAX_VALUE
+ Long population = new Long(populationString);
+ if(population.longValue() > 0){ //use longValue() - intValue() could overflow to a negative number
+ doc.add(Properties.gn_population.toString(),population);
+ }
+ }
+ //elevation
+ String elevationString = t.nextElement();
+ if(elevationString == null){
+ elevationString = t.nextElement(); //if there is no elevation, use the gtopo30 value instead
+ } else {
+ t.nextElement(); //if there is already an elevation, consume the gtopo30 entry
+ }
+ if(elevationString != null){
+ Integer alt = new Integer(elevationString);
+ if(alt.intValue() > -9999){ //it looks like -9999 is sometimes used to indicate an unknown value!
+ doc.add(Properties.geo_alt.toString(),alt);
+ }
+ }
+ //time zone
+ t.nextElement(); //not used
+ //mod-date
+ String modDateString = t.nextElement();
+ if(modDateString != null){
+ try {
+ doc.add(Properties.dc_date.toString(),TimeUtils.toDate(DataTypeEnum.DateTime, modDateString));
+ }catch (IllegalArgumentException e) {
+ log.warn(String.format("Unable to parse modificationDate for geonamesID %s from value %s",doc.getId(),modDateString));
+ }
+ }
+ //and add geonames.org as the creator!
+ doc.add(Properties.dc_creator.toString(),"http://www.geonames.org/");
+ return doc;
+ }
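+ //For reference, a sketch of a main-table line as consumed by
+ //importFeature above (tab-separated; values illustrative, not quoted
+ //from the dump):
+ // 2761369<tab>Vienna<tab>Vienna<tab>...<tab>48.20849<tab>16.37208<tab>P<tab>PPLC<tab>AT<tab>...<tab>timezone<tab>2010-01-01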
+
+ private void addParents(Representation doc,Integer id,String[] adminCodes){
+ Integer[] adminIds = new Integer[5];
+ //now process the admin Codes (including the country at index 0)
+ for(int i=0;i<adminCodes.length;i++){
+ if(adminCodes[i] != null && !adminCodes[i].equals("00")){ //00 is used to indicate "not known"
+ StringBuilder parentCode = new StringBuilder();
+ for(int j=0;j<i;j++){
+ parentCode.append(adminCodes[j]); //add all the previous codes
+ parentCode.append('.'); //add the separator char
+ }
+ parentCode.append(adminCodes[i]);//add the current (last) element
+ adminIds[i] = adminCode2featureId.get(parentCode.toString()); //might also add null!
+ }
+ }
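+ //For illustration (codes assumed): with adminCodes = {"AT","07","703",null,null}
+ //the loop above looks up the keys "AT", "AT.07" and "AT.07.703" in
+ //adminCode2featureId.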
+ //now get the direct parents
+ Map<Integer,Collection<Integer>> parents = new HashMap<Integer, Collection<Integer>>();
+ getParents(id,parents);
+ //add all parents
+ doc.add(Properties.gn_parentFeature.toString(), getFeatureReferences(parents.keySet()));
+ //get admin hierarchy
+
+ Set<Integer> parentLevel;
+ //add country
+ if(adminIds[0]!=null){
+ doc.add(Properties.gn_parentCountry.toString(), vf.createReference(geonamesFeatureBase+adminIds[0]));
+ parentLevel = Collections.singleton(adminIds[0]);
+ } else {
+ parentLevel = Collections.emptySet();
+ }
+ //add the admin codes for the 4 levels
+ parentLevel = addAdminLevel(doc, Properties.gn_parentADM1, parents, parentLevel, adminIds[1]);
+ parentLevel = addAdminLevel(doc, Properties.gn_parentADM2, parents, parentLevel, adminIds[2]);
+ parentLevel = addAdminLevel(doc, Properties.gn_parentADM3, parents, parentLevel, adminIds[3]);
+ parentLevel = addAdminLevel(doc, Properties.gn_parentADM4, parents, parentLevel, adminIds[4]);
+ }
+ /**
+ * This method combines the information of <ul>
+ * <li> the adminIds originating from the information in the main feature table of geonames
+ * <li> the hierarchy information originating from the hierarchy table.
+ * </ul>
+ * into the full hierarchy of administrative regions.<br>
+ * This code would be much simpler if one could trust either of the two
+ * data sources. However, first tests have shown that both structures contain some errors!
+ * @param doc The doc to add the data to
+ * @param property the property used for this level
+ * @param parents the parent->child mappings for the current geonames feature
+ * @param parentLevel the regions of the parent level (should be only one, but sometimes there are more).
+ * This data is based on the hierarchy table.
+ * @param adminId the region as stored in the geonames main table (only available for levels 1 and 2)
+ * @return the regions of this level (should be only one, but sometimes there are more)
+ */
+ private Set<Integer> addAdminLevel(Representation doc,Properties property, Map<Integer,Collection<Integer>> parents,Set<Integer> parentLevel, Integer adminId){
+ Set<Integer> currentLevel = new HashSet<Integer>();
+ //first add the region originating from the admin info file
+ if(adminId!=null){
+ currentLevel.add(adminId);
+ }
+ for(Integer parent : parentLevel){
+ //second add the regions found via the children of the parent level
+ Collection<Integer> tmp = parents.get(parent);
+ if(tmp != null){
+ currentLevel.addAll(tmp);
+ }
+ }
+ if(!currentLevel.isEmpty()){ //now add all the regions we found
+ doc.add(property.toString(), getFeatureReferences(currentLevel));
+ if(currentLevel.size()>1){ //write warning if there are multiple ids
+ log.warn(String.format("Multiple %s for ID %s (ids: %s)",property.name(),doc.getId(),currentLevel.toString()));
+ }
+ }
+ return currentLevel;
+ }
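+ //A sketch of the merge performed above (IDs are made up): if the main
+ //table contributes adminId=111 and the hierarchy table lists 111 as a
+ //child of the parent level, currentLevel ends up as {111}; disagreeing
+ //sources simply yield multiple entries plus the warning above.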
+ /**
+ * Recursive method that finds all parents and adds the children of the current
+ * node (not all children, but only those within the current tree)
+ * @param id the id of the lower level
+ * @param parents the map used to collect all the parent/child mappings
+ */
+ private void getParents(Integer id, Map<Integer,Collection<Integer>> parents){
+ Collection<Integer> current = parentFeature.get(id);
+ if(current != null){
+ for(Integer parent : current){
+ Collection<Integer> children = parents.get(parent);
+ if(children == null){
+ children = new HashSet<Integer>();
+ parents.put(parent, children);
+ }
+ if(children.add(id)){
+ getParents(parent, parents);
+ }
+ }
+ }
+ current = adminParentFeature.get(id);
+ if(current != null){
+ for(Integer parent : current){
+ Collection<Integer> children = parents.get(parent);
+ if(children == null){
+ children = new HashSet<Integer>();
+ parents.put(parent, children);
+ }
+ if(children.add(id)){
+ getParents(parent, parents);
+ }
+ }
+ }
+ }
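+ //Example walk (IDs are made up): for id=3 with parentFeature 3->{2} and
+ //2->{1}, the parents map ends up as {2=[3], 1=[2]} - each key is a
+ //parent and the value holds its children within this tree.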
+
+ private int readCountryInfos() throws IOException{
+ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(countryInfoFile), Charset.forName("utf-8")));
+ String line;
+ int lineCount = 0;
+ while((line = reader.readLine()) != null){
+ if(line.indexOf('#')!=0 && line.length()>0){ //# is used as comment
+ Tokenizer t = new Tokenizer(line);
+ String code = null;
+ Integer geonamesId = null;
+ int i=1;
+ for(;t.hasMoreElements();i++){
+ String actToken = t.nextElement();
+ if(i==1){
+ code = actToken;
+ }
+ if(i==countryGeonamesIdPos){
+ geonamesId = new Integer(actToken);
+ break;
+ }
+ }
+ if(i==countryGeonamesIdPos){
+ adminCode2featureId.put(code,geonamesId);
+ countryCode2featureId.put(code,geonamesId);
+ lineCount++;
+ } else {
+ log.warn("Unable to parse countryInfo from Line "+line);
+ }
+ }
+ }
+ reader.close();
+ reader = null;
+ return lineCount;
+ }
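+ //A countryInfo.txt line starts with the ISO code and carries the
+ //geonames feature id at column countryGeonamesIdPos, roughly (columns
+ //abbreviated, values assumed): AT<tab>AUT<tab>...<tab>2782113<tab>...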
+ /**
+ * There are two sources of hierarchy in the geonames.org dumps. <p>
+ * First, the admin region codes stored in the main table, in combination
+ * with the countryInfo and admin region files for the first two levels;
+ * these use the ISO country code plus an additional number for linking
+ * the regions. Second, the hierarchy table providing parentID, childID,
+ * [type] information; this uses feature IDs for linking. <p>
+ * This method reads the first data source into memory. For the country
+ * related information it calls {@link #readCountryInfos()}.
+ * @throws IOException
+ */
+ private void readAdminCodes() throws IOException{
+ long start = System.currentTimeMillis();
+ //first read adminCodes based on the countryInfos
+ int lineCount = readCountryInfos();
+ for(File adminCodeFile : adminCodesFiles){
+ BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(adminCodeFile), Charset.forName("utf-8")));
+ String line;
+ while((line = reader.readLine()) != null){
+ if(line.indexOf('#')!=0 && line.length()>0){ //# is used as comment
+ lineCount++;
+ //no tokenizer this time ... need only first and last column!
+ String code = line.substring(0, line.indexOf('\t'));
+ Integer geonamesId = new Integer(line.substring(line.lastIndexOf('\t')+1));
+ adminCode2featureId.put(code, geonamesId);
+ }
+ }
+ reader.close();
+ reader = null;
+ }
+ log.info("read "+lineCount+" AdminCodes in "+(System.currentTimeMillis()-start)+"ms");
+ }
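+ //An admin code file line is assumed to look like (values illustrative):
+ // AT.07<tab>Tyrol<tab>Tyrol<tab>2763586
+ //so the loop above maps the code "AT.07" to the feature id 2763586.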
+ /**
+ * This method loads the alternate labels of geonames.org. Such labels are
+ * used for multi-language support but also include postal codes, links
+ * to Wikipedia, airport codes ... see {@link NameType} for details.
+ * TODO: This loads a lot of data into memory. Maybe one should consider
+ * using a caching framework like OSCache. Features are sorted by country
+ * anyway, so frequently used labels would stay in memory while labels
+ * that are used only once could be serialised to the cache in low-memory
+ * environments!
+ * @throws IOException
+ */
+ private void readAlternateNames() throws IOException{
+ BufferedReader reader;
+ if(alternateNamesFile.getName().endsWith(".zip")){
+ ZipFile alternateNamesArchive;
+ try {
+ alternateNamesArchive = new ZipFile(alternateNamesFile);
+ } catch (IOException e) {
+ //the init already checks that this is an existing, readable file ...
+ // ... so throw a runtime exception here!
+ throw new IllegalArgumentException("Unable to access geonames.org DB Dump file",e);
+ }
+ Enumeration<? extends ZipEntry> e = alternateNamesArchive.entries();
+ ZipEntry entry = null;
+ while(e.hasMoreElements()){
+ ZipEntry cur = e.nextElement();
+ if(!cur.isDirectory() && cur.getName().equalsIgnoreCase("alternatenames.txt")){
+ entry = cur;
+ break;
+ }
+ }
+ if(entry ==null){
+ throw new IllegalStateException("Archive with alternate Names does not contain the \"alternateNames.txt\" file!");
+ } else {
+ log.info("read alternate names from Archive Entry "+entry.getName());
+ reader = new BufferedReader(new InputStreamReader(alternateNamesArchive.getInputStream(entry), Charset.forName("utf-8")));
+ }
+ } else {
+ reader = new BufferedReader(new InputStreamReader(new FileInputStream(alternateNamesFile), Charset.forName("utf-8")));
+ }
+ FeatureName name;
+ int lineCount = 0;
+ EnumMap<NameType, int[]> labelTypeCounts = new EnumMap<NameType, int[]>(NameType.class);
+ for(NameType entry :NameType.values()){
+ labelTypeCounts.put(entry, new int[]{0});
+ }
+ String line;
+ long start = System.currentTimeMillis();
+ while((line = reader.readLine()) != null){
+ try {
+ name = new FeatureName(line);
+ } catch (RuntimeException e) {
+ log.warn("Unable to parse Featurname for line: "+line,e);
+ continue;
+ }
+ List<FeatureName> names = featureNames.get(name.geonameID);
+ if(names == null){
+ names = new ArrayList<FeatureName>();
+ featureNames.put(name.geonameID, names);
+ }
+ if(name.isPreferred()){
+ names.add(0, name);
+ } else {
+ names.add(name);
+ }
+ lineCount++;
+ labelTypeCounts.get(name.getLabelType())[0]++; //increase the count for this type!
+ if(log.isDebugEnabled() && lineCount%10000==0){
+ log.debug("processed "+lineCount+" labels");
+ }
+ }
+ log.info("read "+lineCount+" alternate Names for "+featureNames.size()+" Features in "+(System.currentTimeMillis()-start)+"ms");
+ for(Entry<NameType, int[]> count : labelTypeCounts.entrySet()){
+ log.info(" "+count.getKey().toString()+": "+count.getValue()[0]);
+ }
+ }
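+ //An alternateNames.txt line as parsed by FeatureName below is assumed
+ //to look like (values illustrative):
+ // 1523968<tab>2761369<tab>de<tab>Wien<tab>1<tab>0
+ //i.e. labelId, geonameId, lang, name, isPreferred, isShortName.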
+ /**
+ * There are two sources of hierarchy in the geonames.org dumps. <p>
+ * First, the admin region codes stored in the main table, in combination
+ * with the countryInfo and admin region files for the first two levels;
+ * these use the ISO country code plus an additional number for linking
+ * the regions. Second, the hierarchy table providing parentID, childID,
+ * [type] information; this uses feature IDs for linking. <p>
+ * This method processes the second data source and stores the child ->
+ * parents mappings in memory. Administrative hierarchies are stored in a
+ * separate map. Note that even for administrative regions there are some
+ * cases where a child has more than one parent.
+ * @throws IOException
+ */
+ private void readHierarchy() throws IOException{
+ BufferedReader reader;
+ if(hierarchyFile.getName().endsWith(".zip")){
+ ZipFile hierarchyArchive;
+ try {
+ hierarchyArchive = new ZipFile(hierarchyFile);
+ } catch (IOException e) {
+ //the init already checks that this is an existing, readable file ...
+ // ... so throw a runtime exception here!
+ throw new IllegalArgumentException("Unable to access geonames.org DB Dump hierarchy file",e);
+ }
+ Enumeration<? extends ZipEntry> e = hierarchyArchive.entries();
+ ZipEntry entry = null;
+ while(e.hasMoreElements()){
+ ZipEntry cur = e.nextElement();
+ if(!cur.isDirectory() && cur.getName().equalsIgnoreCase("hierarchy.txt")){
+ entry = cur;
+ break;
+ }
+ }
+ if(entry == null){
+ throw new IllegalStateException("Archive with hierarchy data does not contain the \"hierarchy.txt\" file!");
+ } else {
+ log.info("read hierarchy data from Archive Entry "+entry.getName());
+ reader = new BufferedReader(new InputStreamReader(hierarchyArchive.getInputStream(entry), Charset.forName("utf-8")));
+ }
+ } else {
+ reader = new BufferedReader(new InputStreamReader(new FileInputStream(hierarchyFile), Charset.forName("utf-8")));
+ }
+ String line;
+ int lineCount=0;
+ long start = System.currentTimeMillis();
+ while((line = reader.readLine()) != null){
+ lineCount++;
+ Tokenizer t = new Tokenizer(line);
+ Integer parent = new Integer(t.nextElement());
+ Integer child = new Integer(t.nextElement());
+ String type;
+ if(t.hasMoreElements()){
+ type = t.nextElement();
+ } else {
+ type = null;
+ }
+ if("ADM".equals(type)){
+ Collection<Integer> parents = adminParentFeature.get(child);
+ if(parents == null){
+ parents = new ArrayList<Integer>(1); //there are only some exceptions with multiple parents
+ adminParentFeature.put(child, parents);
+ }
+ parents.add(parent);
+ } else {
+ Collection<Integer> parents = parentFeature.get(child);
+ if(parents == null){
+ parents = new ArrayList<Integer>(3);
+ parentFeature.put(child, parents);
+ }
+ parents.add(parent);
+ }
+ }
+ log.info(String.format("read %d hierarchy relations in %dms",lineCount,System.currentTimeMillis()-start));
+ }
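+ //A hierarchy.txt line is parentId<tab>childId[<tab>type], e.g. (ids
+ //assumed): 2782113<tab>2763586<tab>ADM - read above into
+ //adminParentFeature for type ADM, into parentFeature otherwise.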
+
+// private static void indexGeonamesOntology() throws RepositoryException, RDFParseException, IOException, SemanticSearchProviderException{
+// Repository ontRepository = new SailRepository(new MemoryStore());
+// ontRepository.initialize();
+// RepositoryConnection con = ontRepository.getConnection();
+// File geonamesOnt = new File(GeoNamesIndexer.geonamesOntFile);
+// System.out.println("Geonames Ontology: ");
+// System.out.println(" > name : "+geonamesOnt.getAbsolutePath());
+// System.out.println(" > exists : "+geonamesOnt.exists());
+// System.out.println(" > isFile : "+geonamesOnt.isFile());
+// //add the geonames Ont to the Repository
+// con.add(geonamesOnt, geonamesOntBase, RDFFormat.RDFXML);
+// RepositoryResult<Statement> results = con.getStatements(null, org.openrdf.model.vocabulary.RDF.TYPE, null, false);
+// Map<Resource,IndexInputDocument> geonamesOntResources = new HashMap<Resource,IndexInputDocument>();
+// log.info("Process Ontology:");
+// for(Statement stm: results.asList()){
+// log.debug(" Statement : "+stm.getSubject());
+// //check for contains to avoid processing a resource twice if it has two types
+// if(!geonamesOntResources.containsKey(stm.getSubject())){
+// log.info(" > "+stm.getSubject());
+// geonamesOntResources.put(stm.getSubject(), getResourceValues(manager.getPathRegistry(), con, stm));
+// }
+// }
+// log.info("Index Geonames Ontology ("+geonamesOntResources.size()+" Resources)");
+// manager.getIndexProvider().indexDocuments(geonamesOntResources.values());
+// con.close();
+// con = null;
//
-// }
-// private static IndexInputDocument getResourceValues(PathRegistry pathRegistry,
-// RepositoryConnection con, Statement stm) throws RepositoryException {
-// IndexInputDocument inputDoc = new IndexInputDocument(stm.getSubject().stringValue());
-// RepositoryResult<Statement> designValues = con.getStatements(stm.getSubject(),null,null,false);
-// for(Statement value: designValues.asList()){
-// log.debug(" "+value.getPredicate()+"="+value.getObject());
-// PathElement pathElement = pathRegistry.getPathElement(value.getPredicate().stringValue());
-// //in the geonames Data the lat/lon/alt are not marked with the dataType
-// // -> therefore try to parse the dataType from the String value!
-// inputDoc.add(pathElement, value.getObject());
-// }
-// debugInputDoc(inputDoc);
-// return inputDoc;
-// }
-
- public static final class FeatureName{
- enum NameType {
- naturalLanguage,
- postal,
- link,
- abbreviation,
- airportCode,
- unknown
- }
- private final NameType type;
- private final int labelID;
- private final Integer geonameID;
- private final String name;
- private final String lang;
- private final boolean preferred;
- private final boolean shortName;
- private final static String TRUE = "1";
- protected FeatureName(String line){
- Tokenizer t = new Tokenizer(line);
- labelID = Integer.parseInt(t.nextElement()); //first Elem the labelID
- geonameID = Integer.parseInt(t.nextElement());
- String lang = t.nextElement();
- if(lang != null && (lang.length() == 2 || lang.length() == 3)){
- this.lang = lang;
- } else {
- this.lang = null; //no valied lang Code
- }
- if(lang == null || lang.length()<=3){
- type = NameType.naturalLanguage;
- } else if("post".equals(lang)){
- type = NameType.postal;
- } else if("link".equals(lang)) {
- type = NameType.link;
- } else if("abbr".equals(lang)) {
- type = NameType.abbreviation;
- } else if("iata".equals(lang) || "icao".equals(lang) || "faac".equals(lang)){
- type = NameType.airportCode;
- } else {
- type = NameType.unknown; // e.g. fr_1793 for French Revolution names
- }
- name = t.nextElement();
- if(name == null){
- throw new IllegalStateException(" Unable to parse name from line:" + line);
- }
- String act = t.nextElement();
- this.preferred = act != null && act.equals(TRUE);
- act = t.nextElement();
- this.shortName = act != null && act.equals(TRUE);
- }
- public final Integer getGeonameID() {
- return geonameID;
- }
- public final String getName() {
- return name;
- }
- public final String getLang() {
- return lang;
- }
- public final boolean isPreferred() {
- return preferred;
- }
- public final boolean isShortName() {
- return shortName;
- }
- public final boolean isNaturalLanguageLabel(){
- return type == NameType.naturalLanguage;
- }
- public final NameType getLabelType(){
- return type;
- }
- @Override
- public final boolean equals(Object obj) {
- return obj != null &&
- obj instanceof FeatureName &&
- ((FeatureName)obj).labelID == labelID;
- }
- @Override
- public final int hashCode() {
- return labelID;
- }
- public final String toString(){
- return name+(lang!=null?('@'+lang):"");
- }
- }
- public static class Tokenizer implements Enumeration<String>{
- private static final String DELIM ="\t";
- private final StringTokenizer t;
- boolean prevElementWasNull = true;
- public Tokenizer(String data){
- t = new StringTokenizer(data, DELIM, true);
- }
- @Override
- public boolean hasMoreElements() {
- return t.hasMoreTokens();
- }
-
- @Override
- public String nextElement() {
- if(!prevElementWasNull){
- t.nextElement();//dump the delim
- }
- if(!t.hasMoreElements()){
- //this indicated, that the current Element is
- // - the last Element
- // - and is null
- prevElementWasNull = true;
- return null;
- } else {
- String act = t.nextToken();
- if(DELIM.equals(act)){
- prevElementWasNull = true;
- return null;
- } else {
- prevElementWasNull = false;
- return act;
- }
- }
- }
- }
+// }
+// private static IndexInputDocument getResourceValues(PathRegistry pathRegistry,
+// RepositoryConnection con, Statement stm) throws RepositoryException {
+// IndexInputDocument inputDoc = new IndexInputDocument(stm.getSubject().stringValue());
+// RepositoryResult<Statement> designValues = con.getStatements(stm.getSubject(),null,null,false);
+// for(Statement value: designValues.asList()){
+// log.debug(" "+value.getPredicate()+"="+value.getObject());
+// PathElement pathElement = pathRegistry.getPathElement(value.getPredicate().stringValue());
+// //in the geonames Data the lat/lon/alt are not marked with the dataType
+// // -> therefore try to parse the dataType from the String value!
+// inputDoc.add(pathElement, value.getObject());
+// }
+// debugInputDoc(inputDoc);
+// return inputDoc;
+// }
+
+ public static final class FeatureName{
+ enum NameType {
+ naturalLanguage,
+ postal,
+ link,
+ abbreviation,
+ airportCode,
+ unknown
+ }
+ private final NameType type;
+ private final int labelID;
+ private final Integer geonameID;
+ private final String name;
+ private final String lang;
+ private final boolean preferred;
+ private final boolean shortName;
+ private final static String TRUE = "1";
+ protected FeatureName(String line){
+ Tokenizer t = new Tokenizer(line);
+ labelID = Integer.parseInt(t.nextElement()); //first element is the labelID
+ geonameID = Integer.parseInt(t.nextElement());
+ String lang = t.nextElement();
+ if(lang != null && (lang.length() == 2 || lang.length() == 3)){
[... 98 lines stripped ...]