Posted to commits@lucene.apache.org by ct...@apache.org on 2017/05/09 19:51:30 UTC

[1/2] lucene-solr:jira/solr-10290: SOLR-10296: fix glossary headings; fix callout font used in PDF

Repository: lucene-solr
Updated Branches:
  refs/heads/jira/solr-10290 8436b4050 -> 96058f824


SOLR-10296: fix glossary headings; fix callout font used in PDF


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/07d81790
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/07d81790
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/07d81790

Branch: refs/heads/jira/solr-10290
Commit: 07d81790f00e25db28e495e16688cf6d70bfeeb5
Parents: 8436b40
Author: Cassandra Targett <ct...@apache.org>
Authored: Tue May 9 14:00:49 2017 -0500
Committer: Cassandra Targett <ct...@apache.org>
Committed: Tue May 9 14:00:49 2017 -0500

----------------------------------------------------------------------
 .../field-type-definitions-and-properties.adoc  |  8 +---
 solr/solr-ref-guide/src/managed-resources.adoc  |  6 ---
 .../src/pdf/themes/refguide-theme.yml           | 20 ++++------
 solr/solr-ref-guide/src/solr-glossary.adoc      | 41 ++++++++++----------
 4 files changed, 29 insertions(+), 46 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/07d81790/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc
index 5454b8f..01bfe11 100644
--- a/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc
+++ b/solr/solr-ref-guide/src/field-type-definitions-and-properties.adoc
@@ -16,12 +16,6 @@ A field type definition can include four types of information:
 
 Field types are defined in `schema.xml`. Each field type is defined between `fieldType` elements. They can optionally be grouped within a `types` element. Here is an example of a field type definition for a type called `text_general`:
 
-// TODO callout isn't working in the PDF...
-//    http://asciidoctor.org/docs/user-manual/#xml-callouts
-// TODO: relevant github issues w/possible font related work arounds...
-//  - https://github.com/asciidoctor/asciidoctor-pdf/issues/409
-//  - https://github.com/asciidoctor/asciidoctor-pdf/issues/377
-//  - https://github.com/asciidoctor/asciidoctor-pdf/issues/133
 [source,xml,subs="verbatim,callouts"]
 ----
 <fieldType name="text_general" class="solr.TextField" positionIncrementGap="100"> --<1>
@@ -45,7 +39,7 @@ Field types are defined in `schema.xml`. Each field type is defined between `fie
 <1> The first line in the example above contains the field type name, `text_general`, and the name of the implementing class, `solr.TextField`.
 <2> The rest of the definition is about field analysis, described in <<understanding-analyzers-tokenizers-and-filters.adoc#understanding-analyzers-tokenizers-and-filters,Understanding Analyzers, Tokenizers, and Filters>>.
 
-The implementing class is responsible for making sure the field is handled correctly. In the class names in `schema.xml`, the string `solr` is shorthand for `org.apache.solr.schema` or `org.apache.solr.analysis`. Therefore, `solr.TextField` is really `org.apache.solr.schema.TextField.`.
+The implementing class is responsible for making sure the field is handled correctly. In the class names in `schema.xml`, the string `solr` is shorthand for `org.apache.solr.schema` or `org.apache.solr.analysis`. Therefore, `solr.TextField` is really `org.apache.solr.schema.TextField`.
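
As a quick illustration of that shorthand, a minimal sketch (the type names here are invented for illustration) — the two declarations below resolve to the same implementing class:

[source,xml]
----
<!-- shorthand form commonly used in schema.xml -->
<fieldType name="string_short" class="solr.StrField"/>

<!-- fully qualified form; resolves to the same class -->
<fieldType name="string_long" class="org.apache.solr.schema.StrField"/>
----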
 
 == Field Type Properties
 

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/07d81790/solr/solr-ref-guide/src/managed-resources.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/managed-resources.adoc b/solr/solr-ref-guide/src/managed-resources.adoc
index 530853c..7e91ee3 100644
--- a/solr/solr-ref-guide/src/managed-resources.adoc
+++ b/solr/solr-ref-guide/src/managed-resources.adoc
@@ -27,12 +27,6 @@ Let's begin learning about managed resources by looking at a couple of examples
 
 To begin, you need to define a field type that uses the <<filter-descriptions.adoc#FilterDescriptions-ManagedStopFilter,ManagedStopFilterFactory>>, such as:
 
-// TODO callout isn't working in the PDF...
-//    http://asciidoctor.org/docs/user-manual/#xml-callouts
-// TODO: relevant github issues w/possible font related work arounds...
-//  - https://github.com/asciidoctor/asciidoctor-pdf/issues/409
-//  - https://github.com/asciidoctor/asciidoctor-pdf/issues/377
-//  - https://github.com/asciidoctor/asciidoctor-pdf/issues/133
 [source,xml,subs="verbatim,callouts"]
 ----
 <fieldType name="managed_en" positionIncrementGap="100">

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/07d81790/solr/solr-ref-guide/src/pdf/themes/refguide-theme.yml
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/pdf/themes/refguide-theme.yml b/solr/solr-ref-guide/src/pdf/themes/refguide-theme.yml
index 5e9987c..5e272ce 100644
--- a/solr/solr-ref-guide/src/pdf/themes/refguide-theme.yml
+++ b/solr/solr-ref-guide/src/pdf/themes/refguide-theme.yml
@@ -13,22 +13,16 @@ font:
       bold: Inconsolata/Inconsolata-Bold.ttf
       italic: Inconsolata/Inconsolata-Regular.ttf
       bold_italic: Inconsolata/Inconsolata-Bold.ttf
-    # M+ 1mn is only used for callout numbers in code examples
-    M+ 1mn:
+    # M+1mn is from the asciidoctor-pdf project
+    # Provides the glyphs for callout numbers (see conum section below)
+    # This is a fallback font, and will only be used when it can fill in missing glyphs from other fonts
+    M+1mn Fallback:
       normal: mplus1mn/mplus1mn-regular-ascii-conums.ttf
       bold: mplus1mn/mplus1mn-bold-ascii.ttf
       italic: mplus1mn/mplus1mn-italic-ascii.ttf
       bold_italic: mplus1mn/mplus1mn-bold_italic-ascii.ttf
-    # M+ 1p supports Latin, Latin-1 Supplement, Latin Extended, Greek, Cyrillic, Vietnamese, Japanese & an assortment of symbols
-    # It also provides arrows for ->, <-, => and <= replacements in case these glyphs are missing from font
-    # This is a fallback font, and will only be used when it can fill in missing glyphs from other fonts
-    M+ 1p Fallback:
-      normal: mplus1p-regular-fallback.ttf
-      bold: mplus1p-regular-fallback.ttf
-      italic: mplus1p-regular-fallback.ttf
-      bold_italic: mplus1p-regular-fallback.ttf
   fallbacks:
-    - M+ 1p Fallback
+    - M+1mn Fallback
 # page-level settings apply to the entire page
 page:
   background_color: '#ffffff'
@@ -163,9 +157,9 @@ blockquote:
   cite_font_color: '#999999'
 #conums are used for inline callouts
 conum:
-  font_family: M+ 1mn
+  font_family: M+1mn Fallback
   font_color: $literal_font_color
-  font_size: $base_font_size
+  font_size: $base_font_size_large
   line_height: 4 / 3
 example:
   border_color: $base_border_color
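
For reference, the `conum` settings above style the numbered callout markers in listings like the following minimal sketch; `<!--1-->` is the XML-safe callout syntax from the asciidoctor manual:

[source,xml,subs="verbatim,callouts"]
----
<fieldType name="example" class="solr.TextField"/> <!--1-->
----
<1> In the PDF, this callout number is drawn from the `M+1mn Fallback` font.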

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/07d81790/solr/solr-ref-guide/src/solr-glossary.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/src/solr-glossary.adoc b/solr/solr-ref-guide/src/solr-glossary.adoc
index 334dbec..829293a 100644
--- a/solr/solr-ref-guide/src/solr-glossary.adoc
+++ b/solr/solr-ref-guide/src/solr-glossary.adoc
@@ -3,31 +3,32 @@
 :page-permalink: solr-glossary.html
 :page-toc: false
 
-Where possible, terms are linked to relevant parts of the Solr Reference Guide for more information.
+These are common terms used with Solr.
+
+== Solr Terms
 
-'''''
+Where possible, terms are linked to relevant parts of the Solr Reference Guide for more information.
 
 *Jump to a letter:*
 
 <<SolrGlossary-A,A>> <<SolrGlossary-B,B>> <<SolrGlossary-C,C>> <<SolrGlossary-D,D>> <<SolrGlossary-E,E>> <<SolrGlossary-F,F>> G H <<SolrGlossary-I,I>> J K <<SolrGlossary-L,L>> <<SolrGlossary-M,M>> <<SolrGlossary-N,N>> <<SolrGlossary-O,O>> P <<SolrGlossary-Q,Q>> <<SolrGlossary-R,R>> <<SolrGlossary-S,S>> <<SolrGlossary-T,T>> U V <<SolrGlossary-W,W>> X Y <<SolrGlossary-Z,Z>>
 
-[glossary]
 
 [[SolrGlossary-A]]
-== A
+=== A
 
 [[atomicupdates]]<<updating-parts-of-documents.adoc#UpdatingPartsofDocuments-AtomicUpdates,Atomic updates>>::
 An approach to updating only one or more fields of a document, instead of reindexing the entire document.
 
 
 [[SolrGlossary-B]]
-== B
+=== B
 
 [[booleanoperators]]Boolean operators::
 These control the inclusion or exclusion of keywords in a query by using operators such as AND, OR, and NOT.
 
 [[SolrGlossary-C]]
-== C
+=== C
 
 [[cluster]]Cluster::
 In Solr, a cluster is a set of Solr nodes operating in coordination with each other via <<zookeeper,ZooKeeper>>, and managed as a unit. A cluster may contain many collections. See also <<solrclouddef,SolrCloud>>.
@@ -47,7 +48,7 @@ An individual Solr instance (represents a logical index). Multiple cores can run
 To re-initialize a Solr core after changes to `schema.xml`, `solrconfig.xml` or other configuration files.
 
 [[SolrGlossary-D]]
-== D
+=== D
 
 [[distributedsearch]]Distributed search::
 Distributed search is one where queries are processed across more than one <<shard,Shard>>.
@@ -56,13 +57,13 @@ Distributed search is one where queries are processed across more than one <<sha
 A group of <<field,fields>> and their values. Documents are the basic unit of data in a <<collection,collection>>. Documents are assigned to <<shard,shards>> using standard hashing, or by specifically assigning a shard within the document ID. Documents are versioned after each write operation.
 
 [[SolrGlossary-E]]
-== E
+=== E
 
 [[ensemble]]Ensemble::
 A <<zookeeper,ZooKeeper>> term to indicate multiple ZooKeeper instances running simultaneously and in coordination with each other for fault tolerance.
 
 [[SolrGlossary-F]]
-== F
+=== F
 
 [[facet]]Facet::
 The arrangement of search results into categories based on indexed terms.
@@ -71,7 +72,7 @@ The arrangement of search results into categories based on indexed terms.
 The content to be indexed/searched along with metadata defining how the content should be processed by Solr.
 
 [[SolrGlossary-I]]
-== I
+=== I
 
 [[idf]]Inverse document frequency (IDF)::
 A measure of the general importance of a term. It is calculated as the total number of Documents in the collection divided by the number of Documents in which a particular word occurs. See http://en.wikipedia.org/wiki/Tf-idf and {lucene-javadocs}/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html[the Lucene TFIDFSimilarity javadocs] for more info on TF-IDF based scoring and Lucene scoring in particular. See also <<termfrequency,Term frequency>>.
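
A rough worked example (numbers invented; Lucene's classic TF-IDF similarity uses a smoothed logarithm of the ratio described above rather than the raw ratio):

[latexmath]
++++
\mathrm{idf}(t) = 1 + \ln\frac{N}{\mathrm{df}_t + 1},
\qquad N = 1{,}000{,}000,\; \mathrm{df}_t = 100
\;\Rightarrow\; \mathrm{idf}(t) \approx 1 + \ln(9901) \approx 10.2
++++
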
@@ -80,19 +81,19 @@ A measure of the general importance of a term. It is calculated as the number of
 A way of creating a searchable index that lists every word and the documents that contain those words, similar to an index in the back of a book which lists words and the pages on which they can be found. When performing keyword searches, this method is considered more efficient than the alternative, which would be to create a list of documents paired with every word used in each document. Since users search using terms they expect to be in documents, finding the term before the document saves processing resources and time.
 
 [[SolrGlossary-L]]
-== L
+=== L
 
 [[leader]]Leader::
 A single <<replica,Replica>> for each <<shard,Shard>> that takes charge of coordinating index updates (document additions or deletions) to other replicas in the same shard. This is a transient responsibility assigned to a node via an election; if the current Shard Leader goes down, a new node will automatically be elected to take its place. See also <<solrclouddef,SolrCloud>>.
 
 [[SolrGlossary-M]]
-== M
+=== M
 
 [[metadata]]Metadata::
 Literally, _data about data_. Metadata is information about a document, such as its title, author, or location.
 
 [[SolrGlossary-N]]
-== N
+=== N
 
 [[naturallanguagequery]]Natural language query::
 A search that is entered as a user would normally speak or write, as in, "What is aspirin?"
@@ -101,7 +102,7 @@ A search that is entered as a user would normally speak or write, as in, "What i
 A JVM instance running Solr. Also known as a Solr server.
 
 [[SolrGlossary-O]]
-== O
+=== O
 
 [[optimisticconcurrency]]<<updating-parts-of-documents.adoc#UpdatingPartsofDocuments-OptimisticConcurrency,Optimistic concurrency>>::
 Also known as "optimistic locking", this is an approach that allows for updates to documents currently in the index while retaining locking or version control.
@@ -110,13 +111,13 @@ Also known as "optimistic locking", this is an approach that allows for updates
 A single node in <<solrclouddef,SolrCloud>> that is responsible for processing and coordinating actions involving the entire cluster. It keeps track of the state of existing nodes, collections, shards, and replicas, and assigns new replicas to nodes. This is a transient responsibility assigned to a node via an election; if the current Overseer goes down, a new node will be automatically elected to take its place. See also <<solrclouddef,SolrCloud>>.
 
 [[SolrGlossary-Q]]
-== Q
+=== Q
 
 [[query-parser]]Query parser::
 A query parser processes the terms entered by a user.
 
 [[SolrGlossary-R]]
-== R
+=== R
 
 [[recall]]Recall::
 The ability of a search engine to retrieve _all_ of the possible matches to a user's query.
@@ -135,7 +136,7 @@ A method of copying a master index from one server to one or more "slave" or "ch
 Logic and configuration parameters that tell Solr how to handle incoming "requests", whether the requests are to return search results, to index documents, or to handle other custom situations.
 
 [[SolrGlossary-S]]
-== S
+=== S
 
 [[searchcomponent]]<<requesthandlers-and-searchcomponents-in-solrconfig.adoc#requesthandlers-and-searchcomponents-in-solrconfig,SearchComponent>>::
 Logic and configuration parameters used by request handlers to process query requests. Examples of search components include faceting, highlighting, and "more like this" functionality.
@@ -166,7 +167,7 @@ Functionality in Solr that provides the ability to suggest possible query terms
 Synonyms are generally terms that are close to each other in meaning and may substitute for one another. In a search engine implementation, synonyms may be abbreviations as well as words, or terms that are not consistently hyphenated. Examples of synonyms in this context would be "Inc." and "Incorporated" or "iPod" and "i-pod".
 
 [[SolrGlossary-T]]
-== T
+=== T
 
 [[termfrequency]]Term frequency::
 The number of times a word occurs in a given document. See http://en.wikipedia.org/wiki/Tf-idf and {lucene-javadocs}/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html[the Lucene TFIDFSimilarity javadocs] for more info on TF-IDF based scoring and Lucene scoring in particular. See also <<idf,Inverse document frequency (IDF)>>.
@@ -175,13 +176,13 @@ The number of times a word occurs in a given document. See http://en.wikipedia.o
 An append-only log of write operations maintained by each <<replica,Replica>>. This log is required with SolrCloud implementations and is created and managed automatically by Solr.
 
 [[SolrGlossary-W]]
-== W
+=== W
 
 [[wildcard]]Wildcard::
 A wildcard allows a substitution of one or more letters of a word to account for possible variations in spelling or tenses.
 
 [[SolrGlossary-Z]]
-== Z
+=== Z
 
 [[zookeeper]]ZooKeeper::
 Also known as http://zookeeper.apache.org/[Apache ZooKeeper]. The system used by SolrCloud to keep track of configuration files and node names for a cluster. A ZooKeeper cluster is used as the central configuration store for the cluster, a coordinator for operations requiring distributed synchronization, and the system of record for cluster topology. See also <<solrclouddef,SolrCloud>>.


[2/2] lucene-solr:jira/solr-10290: SOLR-10290: remove confluence-export dir with tools for conversion

Posted by ct...@apache.org.
SOLR-10290: remove confluence-export dir with tools for conversion


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/96058f82
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/96058f82
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/96058f82

Branch: refs/heads/jira/solr-10290
Commit: 96058f824cdb75e61438c0f6e14085f91f6971a5
Parents: 07d8179
Author: Cassandra Targett <ct...@apache.org>
Authored: Tue May 9 14:50:17 2017 -0500
Committer: Cassandra Targett <ct...@apache.org>
Committed: Tue May 9 14:50:17 2017 -0500

----------------------------------------------------------------------
 .../conversion-tools/custom.pandoc.template     |  38 --
 .../conversion-tools/jsoup/.gitignore           |   1 -
 .../conversion-tools/jsoup/build.xml            |  91 ---
 .../lucidworks/docparser/HtmlFileFilter.java    |  10 -
 .../lucidworks/docparser/ScrapeConfluence.java  | 645 -------------------
 .../conversion-tools/page-hierarchy.xsl         |  81 ---
 .../conversion-tools/post-process-adocs.pl      |  39 --
 .../conversion-tools/toAsciidoc.sh              |  53 --
 8 files changed, 958 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96058f82/solr/confluence-export/conversion-tools/custom.pandoc.template
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/custom.pandoc.template b/solr/confluence-export/conversion-tools/custom.pandoc.template
deleted file mode 100644
index 5993767..0000000
--- a/solr/confluence-export/conversion-tools/custom.pandoc.template
+++ /dev/null
@@ -1,38 +0,0 @@
-$if(titleblock)$
-$title$
-$for(author)$
-:author: $author$
-$endfor$
-$if(date)$
-:date: $date$
-$endif$
-$if(toc)$
-:toc:
-$endif$
-$if(page-shortname)$
-:page-shortname: $page-shortname$
-$endif$
-$if(page-permalink)$
-:page-permalink: $page-permalink$
-$endif$
-$if(page-tags)$
-:page-tags: $for(page-tags)$[$page-tags$]$sep$, $endfor$
-$endif$
-$if(page-children)$
-:page-children: $for(page-children)$$page-children$$sep$, $endfor$
-$endif$
-
-$endif$
-$for(header-includes)$
-$header-includes$
-
-$endfor$
-$for(include-before)$
-$include-before$
-
-$endfor$
-$body$
-$for(include-after)$
-
-$include-after$
-$endfor$

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96058f82/solr/confluence-export/conversion-tools/jsoup/.gitignore
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/.gitignore b/solr/confluence-export/conversion-tools/jsoup/.gitignore
deleted file mode 100644
index 378eac2..0000000
--- a/solr/confluence-export/conversion-tools/jsoup/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-build

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96058f82/solr/confluence-export/conversion-tools/jsoup/build.xml
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/build.xml b/solr/confluence-export/conversion-tools/jsoup/build.xml
deleted file mode 100644
index d7cc560..0000000
--- a/solr/confluence-export/conversion-tools/jsoup/build.xml
+++ /dev/null
@@ -1,91 +0,0 @@
-<?xml version="1.0" encoding="ASCII"?>
-<project>
-
-  <property name="version"
-            value="1.0"/>
-
-  <property name="jar"
-            value="build/parsers-${version}.jar"/>
-
-  <path id="classpath">
-    <pathelement location="${jar}"/>
-    <pathelement location="lib/jsoup-1.8.2.jar"/>
-  </path>
-
-  <target name="clean">
-    <delete dir="build"/>
-  </target>
-
-  <target name="jar">
-    <mkdir dir="build/classes"/>
-    <javac debug="yes"
-           debuglevel="source,lines,vars"
-           destdir="build/classes"
-           includeantruntime="false">
-      <compilerarg value="-Xlint:all"/>
-      <classpath refid="classpath"/>
-      <src path="src/"/>
-    </javac>
-    <jar destfile="${jar}">
-      <fileset dir="build/classes"
-               includes="**/*.class"/>
-    </jar>
-  </target>
-
-  <property name="work.dir" location="../../"/>
-  <property name="from.dir" location="${work.dir}/raw-export"/>
-  <property name="cleaned.dir" location="${work.dir}/cleaned-export"/>
-  <property name="entities.xml.path" location="${work.dir}/raw-xml-export/entities.xml"/>
-  <property name="page-tree.xml.path" location="${work.dir}/page-tree.xml"/>
-
-  <target name="-dir-check">
-    <fail message="Raw (HTML) confluence export dir does not exist: ${from.dir}">
-      <condition>
-        <not>
-          <available file="${from.dir}" />
-        </not>
-      </condition>
-    </fail>
-    <fail message="Can't find entities.xml in raw (XML) confluence export dir: ${entities.xml.path}">
-      <condition>
-        <not>
-          <available file="${entities.xml.path}" />
-        </not>
-      </condition>
-    </fail>
-  </target>
-  
-  <target name="-page-tree-check">
-    <uptodate property="page-tree.xml.uptodate"
-              srcfile="${entities.xml.path}"
-              targetfile="${page-tree.xml.path}"/>
-  </target>
-  
-  <target name="page-tree"
-          depends="-dir-check,-page-tree-check"
-          unless="${page-tree.xml.uptodate}">
-    <xslt in="${entities.xml.path}" out="${page-tree.xml.path}"
-          style="../page-hierarchy.xsl" />
-  </target>
-  
-  <target name="scrape"
-          depends="-dir-check,jar,page-tree">
-    <delete dir="${cleaned.dir}" />
-    <mkdir dir="${cleaned.dir}"/>
-    <mkdir dir="${cleaned.dir}/images"/>
-    <!-- any "images" in the raw dump are icons that are definitely used by some page
-         (the export code already filtered from the set of all images in confluence) -->
-    <copy todir="${cleaned.dir}/images">
-      <fileset dir="${from.dir}/images" />
-    </copy>
-    <java classname="com.lucidworks.docparser.ScrapeConfluence"
-          failonerror="true"
-          fork="true">
-      <classpath refid="classpath"/>
-      <arg value="${from.dir}"/>
-      <arg value="${page-tree.xml.path}"/>
-      <arg value="${cleaned.dir}"/>
-    </java>
-  </target>
-
-</project>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96058f82/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
deleted file mode 100644
index 9bc3b8c..0000000
--- a/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
+++ /dev/null
@@ -1,10 +0,0 @@
-package com.lucidworks.docparser;
-
-import java.io.File;
-import java.io.FileFilter;
-
-public class HtmlFileFilter implements FileFilter {
-    public boolean accept(File pathname) {
-        return pathname.getName().toLowerCase().endsWith("htm") || pathname.getName().toLowerCase().endsWith("html");
-    }
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96058f82/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
deleted file mode 100644
index 5497883..0000000
--- a/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
+++ /dev/null
@@ -1,645 +0,0 @@
-package com.lucidworks.docparser;
-
-import java.io.*;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.nio.file.Files;
-import java.util.Arrays;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Locale;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import org.jsoup.Jsoup;
-import org.jsoup.nodes.Document;
-import org.jsoup.nodes.Element;
-import org.jsoup.nodes.Node;
-import org.jsoup.nodes.TextNode;
-import org.jsoup.parser.Parser;
-import org.jsoup.parser.Tag;
-import org.jsoup.select.Elements;
-import org.jsoup.select.NodeVisitor;
-
-/**  
- * Extract body of Confluence page using Jsoup library.
- * This creates an identical (flat) directory structure containing the cleaned-up documents
- */
-public class ScrapeConfluence {
-  static final Pattern PRE_CODE_CLASS_PATTERN = Pattern.compile("brush:\\s+([^;]+)");
-  static final Pattern ANCHOR_ID_CLEANER = Pattern.compile("[^A-Za-z0-9\\.\\-\\_\\#]+");
-  static final Pattern LEADING_SPACE_PATTERN = Pattern.compile("\\A\\s+");
-  static final Pattern TRAILING_SPACE_PATTERN = Pattern.compile("\\s+\\Z");
-  static final Pattern ONLY_SPACE_PATTERN = Pattern.compile("\\A\\s*\\Z");
-  static final Pattern JAVADOC_URL_PATH_PATTERN = Pattern.compile("/(solr|core)/\\d+_\\d+_\\d+(/.*)");
-  
-    public static void main(String[] args) throws Exception {
-        if (args.length < 3) {
-            System.err.println("usage: ScrapeConfluence "
-                               + "<indir> <page-tree.xml> <outdir>");
-            System.exit(-1);
-        }
-        File inputDir = new File(args[0]);
-        File pageTreeXmlFile = new File(args[1]);
-        PageTree pageTree = new PageTree(pageTreeXmlFile);
-        File outputDir = new File(args[2]);
-        File imagesDir = new File(outputDir, "images");
-        if (! (imagesDir.exists() || imagesDir.mkdirs() ) ) {
-          throw new RuntimeException("Unable to create images dir: " + imagesDir.toString());
-        }
-        
-        HtmlFileFilter htmlFilter = new HtmlFileFilter();
-        File[] pages = inputDir.listFiles(htmlFilter);
-        for (File page : pages) {
-            if (page.getName().equals("index.html")) {
-              // we don't need/want you
-              // although i really wish i'd realized this page was in the HTML export before
-              // i did all that work to build page-tree.xml from the XML export
-              continue;
-            }
-          
-            System.out.println("input Page URI: " + page.toURI().toString());
-            final Element pageTreePage = pageTree.getPage(page.toURI().toString());
-            final String pageName = pageTree.getPageShortName(pageTreePage);
-            final String title = pageTree.getPageTitle(pageTreePage);
-            final String permalink = pageName + ".html";
-            final File outPage = new File(outputDir, permalink);
-            System.out.println("outPage URI: " + outPage.toURI().toString());
-            
-            if (outPage.exists()) {
-              throw new RuntimeException(permalink + " already exists - multiple files with same shortname: " + page + " => " + outPage);
-            }
-
-            // Confluence encodes &nbsp; as 0xa0.
-            // JSoup API doesn't handle this - change to space before parsing Document
-            String fileContents = readFile(page.getPath());
-            fileContents = fileContents.replace('\u00a0',' ');
-
-            // parse Confluence page
-            Document doc = Jsoup.parse(fileContents);
-            Element mainContent = doc.select("#main-content").first();
-            if (mainContent == null) {
-              throw new RuntimeException(page.getName() + " has no main-content div");
-            }
-            
-            // create clean HTML page
-            Document docOut = Document.createShell(outPage.toURI().toString());
-            docOut.title(title);
-
-            addMetadata(docOut, "page-shortname", pageName);
-            addMetadata(docOut, "page-permalink", permalink);
-            for (Element kid : pageTreePage.children()) {
-              addMetadata(docOut, "page-children", pageTree.getPageShortName(kid));
-            }
-
-            
-            docOut.body().appendChild(mainContent);
-            docOut.normalise();
-
-            cleanupContent(docOut);
-
-            // fix links
-            Elements elements = docOut.select("a[href]");
-            for (Element element : elements) {
-              element.attr("href", fixLink(page, pageTree, element.attr("href")));
-            }
-            
-            // fix (and copy) images
-            for (Element element : docOut.select("img")) {
-              String src = element.attr("src");
-              // attachments can be referenced by other pages
-              String imagePageId = element.attr("data-linked-resource-container-id");
-              String filename = element.attr("data-linked-resource-default-alias");
-              if (null == imagePageId || null == filename ||
-                  "".equals(imagePageId) || "".equals(filename)) {
-                // this is some standard confluence image, not an attachment
-                // assume it's already been copied into place, and leave the src attr alone
-                continue;
-              }
-              String imagePageShortName = pageTree.getPageShortName(pageTree.getPage
-                                                                    (Integer.valueOf(imagePageId)));
-              
-              // copy the file to the desired path if we haven't already...
-              File imagePageDir = new File(imagesDir, imagePageShortName);
-              File imageFile = new File(imagePageDir, filename);
-              if (! imageFile.exists()) {
-                File origImageFile = new File(inputDir, src);
-                if (! origImageFile.exists()) {
-                  throw new RuntimeException("unable to find image: " + origImageFile + " for img in " +
-                                             page.toString());
-                }
-                if (! (imagePageDir.exists() || imagePageDir.mkdirs() ) ) {
-                  throw new RuntimeException("unable to makedirs: " + imagePageDir + " for img: " + src +
-                                             " in " + page.toString());
-                }
-                Files.copy(origImageFile.toPath(), imageFile.toPath());
-              }
-              
-              // rewrite the src attribute
-              element.attr("src", "images/" + imagePageShortName + "/" + filename);
-              // put each image in its own paragraph (block type elements in adoc)
-              element.wrap("<p></p>");
-            }
-
-            // TODO: need to look for non-image attachments and copy them as well
-            // ie: SVG files used to create some of these images
-            
-            docOut.normalise();
-            OutputStream out = new FileOutputStream(outPage);
-            Writer writer = new OutputStreamWriter(out,"UTF-8");
-            BufferedWriter bufWriter = new BufferedWriter(writer);
-            bufWriter.write(docOut.toString());
-            bufWriter.write("\n");
-            bufWriter.close();
-            writer.close();
-            out.close();
-        }
-    }
-
-    static String readFile(String fileName) throws IOException {
-        InputStream in = new FileInputStream(fileName);
-        Reader reader = new InputStreamReader(in,"UTF-8");
-        BufferedReader br = new BufferedReader(reader);
-        try {
-            StringBuilder sb = new StringBuilder();
-            String line = br.readLine();
-            while (line != null) {
-                sb.append(line);
-                sb.append("\n");
-                line = br.readLine();
-            }
-            return sb.toString();
-        } finally {
-            br.close();
-        }
-    }
-
-  static String fixLink(File page, PageTree pageTree, final String href) {
-    try {
-      URI uri = new URI(href);
-      if (uri.isAbsolute()) {
-        // check if it's a javadoc URL and if so update to use our adoc attribute
-        final Matcher matcher = JAVADOC_URL_PATH_PATTERN.matcher(uri.getPath());
-        if (uri.getHost().equals("lucene.apache.org") && matcher.matches()) {
-          String path = matcher.group(2);
-          return (matcher.group(1).equals("core") ? "{lucene-javadocs}" : "{solr-javadocs}") + path;
-        } else {
-          return href;
-        }
-      }
-      // else: not an absolute URL...
-      
-      // any relative URL will get 'REL_LINK//' prepended so we can post-process
-      // the .adoc files to convert from the "link:xxx" syntax to the <<xxx>> syntax
-      // since pandoc doesn't have native support for that.
-      final String PRE = "REL_LINK//";
-      
-      String path = uri.getPath(); 
-      Element linkedPage = pageTree.getPageIfMatch(path);
-      
-      if ("".equals(path)) { // fragment only URL (ie: same page)
-        return PRE + fixAnchorId(href);
-      } else if (null != linkedPage) {
-        final String linkShortName = pageTree.getPageShortName(linkedPage);
-        path = linkShortName + ".adoc";
-
-        String frag = uri.getFragment();
-        if (null == frag || "".equals(frag)) {
-          // we have to have a fragment for intra-page links to work correctly in asciidoc
-          frag = linkShortName;
-        }
-        frag = fixAnchorId(frag);
-        
-        // HACKish, to ensure we get clean path + ?query? + fragment
-        // (assuming we have any query parts in our relative urls to worry about)
-        String fixed = new URI(null, null, path, uri.getQuery(), frag).toString();
-        return PRE + fixed;
-        
-      } // else: no idea what this is...
-
-      System.err.println("found odd rel link: " + href + " in " + page.toString());
-      return PRE + href;
-
-      
-    } catch (URISyntaxException se) {
-      System.err.println("found malformed URI " + href + " in " + page.toString());
-      // assume we should leave it alone...
-      return href;
-    }
-
-  }
-  
-  static void addMetadata(Document docOut, String name, String content) {
-      Element meta = new Element(Tag.valueOf("meta"),".");
-      meta.attr("name", name);
-      meta.attr("content", content);
-      docOut.head().appendChild(meta);
-  }
-  
-  
-  static void cleanupContent(Document docOut) {
-    // start cleanup
-    Elements elements = null;
-    
-    // remove side panels (page-internal ToCs)
-    Element sideBar = docOut.select("[data-type=aside]").first();
-    if (null == sideBar) {
-      // sometimes they aren't an 'aside', they are columns containing panels
-      elements = docOut.select("div.columnMacro");
-      for (Element element : elements) {
-        if (! element.select("div.toc-macro").isEmpty()) {
-          sideBar = element;
-          break;
-        }
-      }
-    }
-    if (null == sideBar) {
-      // final scenario: toc by itself in the page body...
-      elements = docOut.select("div.toc-macro");
-      for (Element element : elements) {
-        if (! element.select("div.toc-macro").isEmpty()) {
-          sideBar = element;
-          break;
-        }
-      }
-    }
-    if (sideBar != null) {
-      // TODO: this currently replaces the entire aside/column/panel if there was one...
-      // ...would it be better to leave the other panel text and only remove the div.toc-macro?
-      //  examples:
-      //    Covered in this section:
-      //    Topics covered in this section:
-      //    Filters discussed in this section:
-      //    Algorithms discussed in this section:
-
-      // NOTE: consciously choosing to completely remove the TOC, instead of adding any metadata/macros to it
-      // let the page presentation decide if/when to use a TOC...
-      //
-      sideBar.remove();
-      // sideBar.replaceWith(new TextNode("toc::[]",""));
-      // addMetadata(docOut, "toc", "true");
-      
-
-    } else {
-      // sanity check if we missed any (multiple TOCs on a page?) ...
-      elements = docOut.select("div.toc-macro");
-      if (! elements.isEmpty()) {
-        System.out.println("MISSED A TOC: " + elements.toString());
-        System.exit(-1);
-      }
-    }
-    
-    // unwrap various formatting tags if they are empty
-    // NOTE: explicitly not doing 'span' here because it might be used as an anchor
-    elements = docOut.select("strong, em, p, code, pre, span:not([id])");
-    for (Element element : elements) {
-      if (!element.hasText()) {
-        element.unwrap(); // unwrap not remove! (even w/o text might be inner nodes, ex: img)
-      }
-    }
-
-    // these spans aren't particularly problematic, and will largely be ignored by pandoc either way
-    // but by removing them here, they simplify some of the logic we need in other cleanup later
-    // (notably when looking for tags inside of code)
-    elements = docOut.select("span.external-link, span.nolink, span.confluence-link, code span:not([id])");
-    for (Element element : elements) {
-      element.unwrap();
-    }
-    
-    // move any leading/trailing space from the leading/trailing textNodes of formatting tags
-    // out of the tags
-    // (completely removing it is dangerous because it might create run-on "words")
-    for (String tag : Arrays.asList("span", "strong", "em", "code", "p")) { 
-      elements = docOut.getElementsByTag(tag);
-      for (Element element : elements) {
-        // Note: not using textNodes() because our first text node may not be our first child,
-        // we don't want to munge spaces from the middle of our html if it just happens to be the
-        // first direct TextNode 
-        List<Node> kids = element.childNodes();
-        if (! kids.isEmpty()) {
-          if (kids.get(0) instanceof TextNode) {
-            TextNode t = (TextNode) kids.get(0);
-            Matcher m = LEADING_SPACE_PATTERN.matcher(t.text());
-            if (m.matches()) {
-              t.text(m.replaceAll(""));
-              element.before(" ");
-            }
-          }
-          if (kids.get(kids.size()-1) instanceof TextNode) {
-            TextNode t = (TextNode) kids.get(kids.size()-1);
-            Matcher m = TRAILING_SPACE_PATTERN.matcher(t.text());
-            if (m.matches()) {
-              t.text(m.replaceAll(""));
-              element.after(" ");
-            }
-          }
-        }
-      }
-    }
-
-    // this is totally bogus, and yet confluence is doing this...
-    elements = docOut.select("code code");
-    for (Element element : elements) {
-      element.unwrap();
-    }
-    
-    // fake out pandoc when an em or strong tag is inside of a code tag
-    elements = docOut.select("code strong");
-    for (Element element : elements) {
-      element.prependText("**");
-      element.appendText("**");
-      element.unwrap();
-    }
-    elements = docOut.select("code em");
-    for (Element element : elements) {
-      element.prependText("__");
-      element.appendText("__");
-      element.unwrap();
-    }
-
-    // in asciidoc, links can wrap code, but code cannot wrap links
-    // so we need to invert the relationship if/when we find it...
-    elements = docOut.select("code > a:only-child");
-    for (Element element : elements) {
-      Element code = element.parent();
-      String href= element.attr("href");
-      element.unwrap();
-      if (! href.equals(code.text())) {
-        // if the entire code block is a URL, we don't need to wrap it in another link
-        // asciidoctor will take care of that for us.
-        code.wrap("<a href=\""+href+"\"></a>");
-      }
-    }
-    
-    // remove confluence styles
-    elements = docOut.select("[style]");
-    for (Element element : elements) {
-      element.removeAttr("style");
-    }
-    // remove confluence themes from <pre> tags
-    elements = docOut.getElementsByTag("pre");
-    for (Element element : elements) {
-      if (element.hasAttr("class")) {
-        Matcher codeType = PRE_CODE_CLASS_PATTERN.matcher(element.attr("class"));
-        if (codeType.find()) {
-          String codeClass = codeType.group(1);
-          // some munging needed in some cases...
-          if (codeClass.equals("html/xml")) {
-            codeClass = "xml";
-          }
-          if (codeClass.equals("js")) {
-            // almost no javascript in ref guide, assume it should really be json
-            codeClass = "json";
-          }
-          if (element.text().startsWith("curl ")) {
-            // if this looks like a curl command, then ignore whatever class might have been in
-            // confluence and treat it as bash
-            codeClass = "bash";
-          }
-          // TODO: other values we should also change here? "powershell" ?
-          element.attr("class", codeClass);
-        } else {
-          element.removeAttr("class");
-        }
-      }
-    }
-
-    // confluence has a nasty habit of (sometimes) putting named anchors people explicitly define
-    // *inside* a header, instead of around/before it.
-    // so if we find any of these, we need to rearrange some things to work around some problems...
-    // https://github.com/asciidoctor/asciidoctor/issues/1875
-    // 
-    // NOTE: just moving an explicit anchor before the header should work, but because of how ids on headers
-    // are treated in asciidoc, and some weirdness in how asciidoctor treats multiple anchors
-    // declared in front of a header, this causes all but one of the anchors to be ignored...
-    //
-    // https://github.com/asciidoctor/asciidoctor/issues/1874
-    //
-    // because of this, we'll use the "explicitly" defined anchor macro from confluence as our "main"
-    // id for the header, and move the existing header id to its own declaration.
-    //
-    // that should result in both still existing in the final adoc file (so they are easy to grep for)
-    // but the one that is most likely to have links to it will be the one used by default in generated html.
-    for (int level = 1; level < 7; level++) {
-      final String h = "h" + level;
-      elements = docOut.getElementsByTag(h);
-      for (Element header : elements) {
-        // first see if we are immediately preceded by an explicit anchor macro...
-        // (any wrapping <p> tags should have already been unwrapped for us)
-        Element previous = header.previousElementSibling();
-        if (null != previous && "span".equals(previous.tagName()) && previous.classNames().contains("confluence-anchor-link")) {
-          // swap the id from this "previous" macro declaration with the "id" of our header
-          final String oldId = header.attr("id");
-          header.attr("id", previous.attr("id"));
-          previous.attr("id", oldId);
-        }
-          
-        // next, look for any anchors declared inside the header...
-        Elements inner = header.getElementsByClass("confluence-anchor-link");
-        for (Element anchor : inner) {
-          final String oldId = header.attr("id");
-          header.attr("id", anchor.attr("id"));
-          if (null != oldId) {
-            // flip id and move the anchor before the header
-            anchor.attr("id", oldId);
-            header.before(anchor);
-          } else {
-            // just remove the anchor completely
-            // (don't think this code path is possible, but including for completeness)
-            anchor.remove();
-          }
-        }
-      }
-    }
-    
-    // replace icon text
-    elements = docOut.getElementsByClass("confluence-information-macro");
-    for (Element element : elements) {
-      final String admonishment = getAdmonishment(element);
-      Elements titles = element.select(".title");
-      if (1 < titles.size()) {
-        System.err.println("admonishment macro has more then 1 title: " + element.outerHtml());
-        System.exit(-1);
-      }
-
-      // it's easier to post-process this than to try to fight the html->pandoc->adoc conversion
-      for (Element title : titles) { // only one, loop is easy
-        title.prependText("TODO_ADMON_TITLE:");
-        element.before(title); // move it before the block
-      }
-      element.prependChild((new Element(Tag.valueOf("p"), ".")).prependText("[" + admonishment + "]===="));
-      element.appendChild((new Element(Tag.valueOf("p"), ".")).prependText("===="));
-    }
-
-    // unwrap various block tags if they are empty
-    for (String tag : Arrays.asList("div","tbody")) {
-      elements = docOut.getElementsByTag(tag);
-      for (Element element : elements) {
-        element.unwrap(); // unwrap not remove! (might be inner nodes, ex: img)
-      }
-    }
-    
-    // remove breaks -- TODO: why?
-    elements = docOut.getElementsByTag("br");
-    for (Element element : elements) {
-      element.remove();
-    }
-
-    // work around https://github.com/asciidoctor/asciidoctor/issues/1873
-    elements = docOut.select("[id]");
-    for (Element element : elements) {
-      final String oldId = element.attr("id");
-      final String newId = fixAnchorId(oldId);
-      if (! oldId.equals(newId)) {
-        // would love to use jsoup's Comment class, but it doesn't survive pandoc
-        // ironically, this does...
-        Element fakeComment = new Element(Tag.valueOf("div"), "");
-        fakeComment.text("// OLD_CONFLUENCE_ID: " + oldId);
-        element.before(fakeComment);
-        element.attr("id", newId);
-      }
-    }
-
-    // pandoc gets really confused when <ol>s get nested, add a comment pointing out
-    // manual cleanup is needed
-    elements = docOut.select("ol:has(ol, ul), ul:has(ol)");
-    LIST: for (Element list : elements) {
-      // if we are wrapped in an outer list, nothing to do - already done at top level
-      for (Element parent : list.parents()) {
-        if ("ol".equals(parent.tagName()) || "ul".equals(parent.tagName())) {
-          continue LIST;
-        }
-      }
-      // would love to use jsoup's Comment class, but it doesn't survive pandoc
-      // ironically, this does...
-      Element fakeComment = new Element(Tag.valueOf("div"), "");
-      fakeComment.text("// TODO: This '"+list.tagName()+"' has problematic nested lists inside of it, needs manual editing");
-      list.before(fakeComment);
-    }
-
-    // table cells containing structural elements are problematic in PDFs...
-    elements = docOut.select("table:has(ol, ul, p ~ p, div, pre, table)");
-    TABLE: for (Element table : elements) {
-      // if we are wrapped in another table, nothing to do - already done at top level
-      for (Element parent : table.parents()) {
-        if ("table".equals(parent.tagName())) {
-          continue TABLE;
-        }
-      }
-      // would love to use jsoup's Comment class, but it doesn't survive pandoc
-      // ironically, this does...
-      Element fakeComment = new Element(Tag.valueOf("div"), "");
-      fakeComment.text("// TODO: This table has cells that won't work with PDF: https://github.com/ctargett/refguide-asciidoc-poc/issues/13");
-      table.before(fakeComment);
-    }
-
-    // final modification: get rid of any leading spaces in paragraphs
-    // (otherwise asciidoctor will treat them as a type of code formatting)
-    elements = docOut.select("p > span:not([id]):first-child");
-    for (Element element : elements) {
-      if (ONLY_SPACE_PATTERN.matcher(element.html()).matches()) {
-        element.remove();
-      }
-    }
-    
-    // in general, pandoc/asciidoctor has problems with tags inside of "code" so log if we have anything
-    elements = docOut.select("code:has(*)");
-    for (Element element : elements) {
-      System.out.println("NOTE: code tag w/nested tags: " + element.outerHtml());
-    }
-      
-    
-    docOut.normalise();
-  }
-
-  /** 
-   * work around https://github.com/asciidoctor/asciidoctor/issues/1873
-   * needs to be called on all "id" attributes, as well as any anchor text in (local) links
-   */
-  public static String fixAnchorId(String id) {
-    Matcher m = ANCHOR_ID_CLEANER.matcher(id);
-    return m.replaceAll("_");
-  }
-
-  /**
-   * convert confluence admonishment macro types to the "equivalent" adoc types we want to use
-   */
-  public static String getAdmonishment(Element e) {
-    String admon = null;
-    if (e.hasClass("confluence-information-macro-information")) {
-      return "NOTE";
-    }
-    if (e.hasClass("confluence-information-macro-tip")) {
-      return "TIP";
-    }
-    if (e.hasClass("confluence-information-macro-note")) {
-      return "IMPORTANT";
-    }
-    if (e.hasClass("confluence-information-macro-warning")) {
-      return "WARNING";
-    }
-    System.err.println("No admonishment mapping for: " + e.outerHtml());
-    System.exit(-1);
-    return null;
-  }
-  
-  /**
-   * Wraps a (Jsoup) "DOM" of the <code>page-tree.xml</code> file with convenience methods
-   * for getting the names, shortnames, and kids of various pages
-   */
-  private static final class PageTree {
-    private static final Pattern HTML_EXPORT_FILENAME = Pattern.compile("^.*?\\D?(\\d+)\\.html$");
-    private static final Pattern SHORT_NAME_CLEANER = Pattern.compile("[^a-z0-9]+");
-    // Jsoup's XML parsing is easier to work with than javax, especially getById
-    private final Document dom;
-    public PageTree(File pageTreeXml) throws Exception {
-      try (FileInputStream fis = new FileInputStream(pageTreeXml)) {
-        this.dom = Jsoup.parse(fis, null, pageTreeXml.toURI().toString(), Parser.xmlParser());
-      }
-    }
-    public Element getPage(int id) {
-      final Element ele = dom.getElementById(""+id);
-      if (null == ele) {
-        throw new NullPointerException("can't find DOM element with id: " + id);
-      }
-      return ele;
-    }
-    public Element getPage(String htmlFilePath) {
-      Element page = getPageIfMatch(htmlFilePath);
-      if (null != page) {
-        return page;
-      } // else...
-      throw new RuntimeException("Can't match page path pattern for html path: " + htmlFilePath);
-    }
-    public Element getPageIfMatch(String htmlFilePath) {
-      if (null == htmlFilePath || 0 == htmlFilePath.length()) {
-        return null;
-      }
-      Matcher m = HTML_EXPORT_FILENAME.matcher(htmlFilePath);
-      if (m.matches()) {
-        int id = Integer.valueOf(m.group(1));
-        return getPage(id);
-      } // else...
-      return null;
-    }
-    public String getPageTitle(Element page) {
-      String title = page.attr("title");
-      if (null == title) {
-        throw new NullPointerException("Page has null title attr");
-      }
-      return title;
-    }
-    public String getPageShortName(Element page) {
-      Matcher m = SHORT_NAME_CLEANER.matcher(getPageTitle(page).toLowerCase(Locale.ROOT));
-      return m.replaceAll("-");
-    }
-    public String getPageShortName(String htmlFilePath) {
-      return getPageShortName(getPage(htmlFilePath));
-    }
-  }
-}
-

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96058f82/solr/confluence-export/conversion-tools/page-hierarchy.xsl
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/page-hierarchy.xsl b/solr/confluence-export/conversion-tools/page-hierarchy.xsl
deleted file mode 100644
index 39cefce..0000000
--- a/solr/confluence-export/conversion-tools/page-hierarchy.xsl
+++ /dev/null
@@ -1,81 +0,0 @@
-<xsl:stylesheet version="1.0"
-                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
-  <!-- A style sheet that can be applied to entities.xml from a Confluence dump
-       and produces just the bare bones data about the hierarchy of pages in (the|each) space,
-       in the order they appear as children of their parent
-  -->
-  
-  <xsl:output indent="yes"/>
-  
-  <xsl:template match="/">
-    <confluence>
-      <xsl:apply-templates select="//object[@class='Space']"/>
-    </confluence>
-  </xsl:template>
-  
-  <xsl:template match="object[@class='Space']">
-    <space>
-      <xsl:attribute name="id"><xsl:value-of select="./id/text()"/></xsl:attribute>
-      <xsl:attribute name="name"><xsl:value-of select="./property[@name='name']/text()"/></xsl:attribute>
-
-      <!-- can't just look for "pages that have no parent" because that will also match old versions.
-           (even the historical versions of pages have a status of 'current')
-      -->
-      <!--
-          So instead look for any page that is part of the space, and does have some position
-          (sort order in space), but does not have a parent
-      -->
-      <xsl:apply-templates select="//object[@class='Page'][boolean(property[@name='position']/text())][not(property[@name='parent'])][property[@name='space']/id/text()=current()/id/text()]" >
-        <!-- NOTE: sort duplicated in recursive Page template below -->
-        <xsl:sort data-type="number" order="ascending"
-                  select="property[@name='position']/text()" />
-        <!-- apparently pages only have position if user has explicitly sorted?
-             otherwise it looks like they default to sort by title? -->
-        <xsl:sort data-type="text" order="ascending"
-                  select="property[@name='title']/text()" />
-      </xsl:apply-templates>
-    </space>
-  </xsl:template>
-
-  <!-- NOTE: This template is recursive -->
-  <xsl:template match="object[@class='Page']">
-    <page>
-      <xsl:attribute name="id"><xsl:value-of select="./id/text()"/></xsl:attribute>
-      <xsl:attribute name="title"><xsl:value-of select="./property[@name='title']/text()"/></xsl:attribute>
-      <!-- add parent info redundantly in case it's helpful -->
-      <xsl:if test="./property[@name='parent']/id">
-        <xsl:attribute name="parent"><xsl:value-of select="./property[@name='parent']/id/text()"/></xsl:attribute>
-      </xsl:if>
-
-      <!-- the sort order, if explicitly set by a confluence user at some point.
-           If this has never been set for a group of children, it apparently defaults to
-           sorting all those children by alpha page title
-      -->
-      <xsl:if test="./property[@name='position']/node()">
-        <xsl:attribute name="sort"><xsl:value-of select="./property[@name='position']/text()"/></xsl:attribute>
-      </xsl:if>
-      
-      <!-- NOTE: doing a for-each on collection[@name='children'] won't work....
-           collection isn't sorted, need to use "position" property from the Pages themselves
-           
-           <xsl:for-each select="collection[@name='children']/element[@class='Page']/id/text()">
-           <xsl:apply-templates select="//object[@class='Page'][id/text()=current()]"/>
-           </xsl:for-each>
-      -->
-      
-      <!-- instead we go out and select every page that has a parent which matches our id
-           (thank god for the parent property) and (recursively) apply templates in "position" sorted order
-      -->
-      <xsl:apply-templates select="//object[@class='Page'][property[@name='parent']/id/text()=current()/id/text()]">
-        <!-- NOTE: sort duplicated in Space template above -->
-        <xsl:sort data-type="number" order="ascending"
-                  select="property[@name='position']/text()" />
-        <!-- apparently pages only have position if user has explicitly sorted?
-             otherwise it looks like they default to sort by title? -->
-        <xsl:sort data-type="text" order="ascending"
-                  select="property[@name='title']/text()" />
-      </xsl:apply-templates>
-    </page>
-  </xsl:template>
-  <xsl:template match="object" /><!-- No-Op for other types of objects -->
-</xsl:stylesheet>

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96058f82/solr/confluence-export/conversion-tools/post-process-adocs.pl
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/post-process-adocs.pl b/solr/confluence-export/conversion-tools/post-process-adocs.pl
deleted file mode 100755
index c471558..0000000
--- a/solr/confluence-export/conversion-tools/post-process-adocs.pl
+++ /dev/null
@@ -1,39 +0,0 @@
-#!perl -i
-
-use strict;
-use warnings;
-
-while (my $line = <>) {
-    # pandoc uses '=========...' syntax for doc title, we want shorter "= TITLE" syntax
-    if (1 == $.) {
-	$line = "= $line";
-    } elsif ((2 == $.) && $line =~ /^=+$/) {
-	next; # skip this line completely
-    }
-
-    # table syntax doesn't need to be so verbose
-    $line =~ s{^\|\={3,}+$}{|===};
-	
-    # fix up relative links (in place edit) -- NOTE: links to anchor in same page get '#' stripped
-    $line =~ s{link:REL_LINK//#?(.*?)\[(.*?)\]}{\<\<$1,$2\>\>}g;
-
-    # fix up javadoc links, since pandoc escapes our attribute syntax
-    $line =~ s<link:%7B(.*?)%7D><{$1}>g;
-
-    # switch all images from inline to 'block' (double colon) and put on their own line of the file
-    # TODO: any attributes we want to add to every image?
-    $line =~ s{image:(.*?)\[(.*?)\]}{image::$1\[$2\]\n}g;
-
-    # admonishments...
-    if ($line =~ s{^TODO_ADMON_TITLE:}{.}) {
-	# next line should be blank, trash it
-	my $trash = <>;
-	$trash =~ /^$/ or die "not a blank trash line: $trash";
-    }
-    $line =~ s{^(\[\w+\])====$}{$1\n====};
-
-    # fixup obviously intended quoted code (otherwise "`foo`" just gets curly quoted)
-    $line =~ s{"`(\w+)`"}{"```$1```"}g;
-    
-    print $line;
-}

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/96058f82/solr/confluence-export/conversion-tools/toAsciidoc.sh
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/toAsciidoc.sh b/solr/confluence-export/conversion-tools/toAsciidoc.sh
deleted file mode 100755
index e6238be..0000000
--- a/solr/confluence-export/conversion-tools/toAsciidoc.sh
+++ /dev/null
@@ -1,53 +0,0 @@
-#!/bin/bash
-
-# PoC demonstration of complete migration from Confluence
-# script to walk over pagetree of cleaned-up HTML pages from Confluence
-# process html to asciidoc
-# reconvert asciidoc to html
-
-
-# parent dir of script until/unless we move it
-WORK_DIR=$(realpath -L "$(dirname $0)/../")
-
-if [ ! -d $WORK_DIR ]
-then
-    echo "$WORK_DIR does not exist (as a directory)"
-    exit -1
-fi
-
-# check that we have the expected version of pandoc
-PANDOC_VER=`pandoc --version | head -1 | cut -d' ' -f 2 | cut -d'.' -f 1-2`
-if [ $PANDOC_VER != "1.17" ]
-then
-    echo "Only tested with pandoc 1.17, you are using $PANDOC_VER"
-    exit -1
-fi
-
-PANDOC_TEMPLATE="$WORK_DIR/conversion-tools/custom.pandoc.template"
-if [ ! -e $PANDOC_TEMPLATE ]
-then
-    echo "$PANDOC_TEMPLATE does not exist"
-    exit -1
-fi
-
-HTML_DIR="$WORK_DIR/cleaned-export"
-ASCII_DIR="$WORK_DIR/../solr-ref-guide/src"
-
-rm $ASCII_DIR/*.adoc
-
-echo "Coping images..."
-rm -rf $ASCII_DIR/images
-cp -r $HTML_DIR/images $ASCII_DIR/images
-
-for x in `find $HTML_DIR -name "*.html"`
-do
-    echo $x;
-    FNAME=`echo ${x} | sed -e "s#${HTML_DIR}/##"`
-    DIRNAME=$(dirname ${FNAME})
-    mkdir -p "$ASCII_DIR/$DIRNAME"
-    
-    # convert to .asciidoc format using pandoc
-    pandoc $HTML_DIR/$FNAME -f html -t asciidoc -i --parse-raw --wrap=none --standalone --atx-headers --template=$PANDOC_TEMPLATE -o ${ASCII_DIR}/${FNAME%.*}.adoc
-
-    perl "$WORK_DIR/conversion-tools/post-process-adocs.pl" ${ASCII_DIR}/${FNAME%.*}.adoc
-done;