Posted to commits@lucene.apache.org by ct...@apache.org on 2017/03/16 16:53:17 UTC

[1/2] lucene-solr:jira/solr-10290: SOLR-10290: Add Confluence conversion tools

Repository: lucene-solr
Updated Branches:
  refs/heads/jira/solr-10290 e825f0a75 -> ec324b294


SOLR-10290: Add Confluence conversion tools


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8736246e
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8736246e
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8736246e

Branch: refs/heads/jira/solr-10290
Commit: 8736246ee51e8fbf08ee8d08a1dc22cdeba50d97
Parents: e825f0a
Author: Cassandra Targett <ct...@apache.org>
Authored: Thu Mar 16 11:49:12 2017 -0500
Committer: Cassandra Targett <ct...@apache.org>
Committed: Thu Mar 16 11:49:12 2017 -0500

----------------------------------------------------------------------
 .../conversion-tools/custom.pandoc.template     |  38 ++
 .../conversion-tools/jsoup/.gitignore           |   1 +
 .../conversion-tools/jsoup/build.xml            |  91 +++
 .../lucidworks/docparser/HtmlFileFilter.java    |  10 +
 .../lucidworks/docparser/ScrapeConfluence.java  | 645 +++++++++++++++++++
 .../conversion-tools/page-hierarchy.xsl         |  81 +++
 .../conversion-tools/post-process-adocs.pl      |  39 ++
 .../conversion-tools/toAsciidoc.sh              |  53 ++
 8 files changed, 958 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/custom.pandoc.template
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/custom.pandoc.template b/solr/confluence-export/conversion-tools/custom.pandoc.template
new file mode 100644
index 0000000..5993767
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/custom.pandoc.template
@@ -0,0 +1,38 @@
+$if(titleblock)$
+$title$
+$for(author)$
+:author: $author$
+$endfor$
+$if(date)$
+:date: $date$
+$endif$
+$if(toc)$
+:toc:
+$endif$
+$if(page-shortname)$
+:page-shortname: $page-shortname$
+$endif$
+$if(page-permalink)$
+:page-permalink: $page-permalink$
+$endif$
+$if(page-tags)$
+:page-tags: $for(page-tags)$[$page-tags$]$sep$, $endfor$
+$endif$
+$if(page-children)$
+:page-children: $for(page-children)$$page-children$$sep$, $endfor$
+$endif$
+
+$endif$
+$for(header-includes)$
+$header-includes$
+
+$endfor$
+$for(include-before)$
+$include-before$
+
+$endfor$
+$body$
+$for(include-after)$
+
+$include-after$
+$endfor$
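
For context: for a page whose metadata includes a shortname, permalink, and two
children, this template should emit an Asciidoc header block roughly like the
following (values hypothetical; post-process-adocs.pl later rewrites the first
line into "= Title" form):

    Apache Solr Reference Guide
    :page-shortname: apache-solr-reference-guide
    :page-permalink: apache-solr-reference-guide.html
    :page-children: about-this-guide, getting-started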

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/jsoup/.gitignore
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/.gitignore b/solr/confluence-export/conversion-tools/jsoup/.gitignore
new file mode 100644
index 0000000..378eac2
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/jsoup/.gitignore
@@ -0,0 +1 @@
+build

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/jsoup/build.xml
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/build.xml b/solr/confluence-export/conversion-tools/jsoup/build.xml
new file mode 100644
index 0000000..d7cc560
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/jsoup/build.xml
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="ASCII"?>
+<project>
+
+  <property name="version"
+            value="1.0"/>
+
+  <property name="jar"
+            value="build/parsers-${version}.jar"/>
+
+  <path id="classpath">
+    <pathelement location="${jar}"/>
+    <pathelement location="lib/jsoup-1.8.2.jar"/>
+  </path>
+
+  <target name="clean">
+    <delete dir="build"/>
+  </target>
+
+  <target name="jar">
+    <mkdir dir="build/classes"/>
+    <javac debug="yes"
+           debuglevel="source,lines,vars"
+           destdir="build/classes"
+           includeantruntime="false">
+      <compilerarg value="-Xlint:all"/>
+      <classpath refid="classpath"/>
+      <src path="src/"/>
+    </javac>
+    <jar destfile="${jar}">
+      <fileset dir="build/classes"
+               includes="**/*.class"/>
+    </jar>
+  </target>
+
+  <property name="work.dir" location="../../"/>
+  <property name="from.dir" location="${work.dir}/raw-export"/>
+  <property name="cleaned.dir" location="${work.dir}/cleaned-export"/>
+  <property name="entities.xml.path" location="${work.dir}/raw-xml-export/entities.xml"/>
+  <property name="page-tree.xml.path" location="${work.dir}/page-tree.xml"/>
+
+  <target name="-dir-check">
+    <fail message="Raw (HTML) confluence export dir does not exist: ${from.dir}">
+      <condition>
+        <not>
+          <available file="${from.dir}" />
+        </not>
+      </condition>
+    </fail>
+    <fail message="Can't find entities.xml in raw (XML) confluence export dir: ${entities.xml.path}">
+      <condition>
+        <not>
+          <available file="${entities.xml.path}" />
+        </not>
+      </condition>
+    </fail>
+  </target>
+  
+  <target name="-page-tree-check">
+    <uptodate property="page-tree.xml.uptodate"
+              srcfile="${entities.xml.path}"
+              targetfile="${page-tree.xml.path}"/>
+  </target>
+  
+  <target name="page-tree"
+          depends="-dir-check,-page-tree-check"
+          unless="${page-tree.xml.uptodate}">
+    <xslt in="${entities.xml.path}" out="${page-tree.xml.path}"
+          style="../page-hierarchy.xsl" />
+  </target>
+  
+  <target name="scrape"
+          depends="-dir-check,jar,page-tree">
+    <delete dir="${cleaned.dir}" />
+    <mkdir dir="${cleaned.dir}"/>
+    <mkdir dir="${cleaned.dir}/images"/>
+    <!-- any "images" in the raw dump are icons that are definitely used by some page
+         (the export code already filtered them from the set of all images in confluence) -->
+    <copy todir="${cleaned.dir}/images">
+      <fileset dir="${from.dir}/images" />
+    </copy>
+    <java classname="com.lucidworks.docparser.ScrapeConfluence"
+          failonerror="true"
+          fork="true">
+      <classpath refid="classpath"/>
+      <arg value="${from.dir}"/>
+      <arg value="${page-tree.xml.path}"/>
+      <arg value="${cleaned.dir}"/>
+    </java>
+  </target>
+
+</project>
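
For reference, given the default properties above (raw HTML export in
../../raw-export, entities.xml from the XML export in ../../raw-xml-export/),
the whole pipeline should be runnable with something like:

    cd solr/confluence-export/conversion-tools/jsoup
    ant scrape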

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
new file mode 100644
index 0000000..9bc3b8c
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
@@ -0,0 +1,10 @@
+package com.lucidworks.docparser;
+
+import java.io.File;
+import java.io.FileFilter;
+
+public class HtmlFileFilter implements FileFilter {
+    public boolean accept(File pathname) {
+        return pathname.getName().toLowerCase().endsWith("htm") || pathname.getName().toLowerCase().endsWith("html");
+    }
+}
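
The filter above is meant to be handed to File.listFiles(FileFilter), as
ScrapeConfluence does below; a minimal standalone usage sketch (the default
input directory name here is hypothetical):

    import java.io.File;
    import com.lucidworks.docparser.HtmlFileFilter;

    public class ListPages {
        public static void main(String[] args) {
            // accepts any file whose (lowercased) name ends in "htm" or "html"
            File inputDir = new File(args.length > 0 ? args[0] : "raw-export");
            File[] pages = inputDir.listFiles(new HtmlFileFilter());
            if (pages == null) {
                throw new RuntimeException("not a readable directory: " + inputDir);
            }
            for (File page : pages) {
                System.out.println(page.getName());
            }
        }
    }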

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
new file mode 100644
index 0000000..5497883
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
@@ -0,0 +1,645 @@
+package com.lucidworks.docparser;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.Tag;
+import org.jsoup.select.Elements;
+import org.jsoup.select.NodeVisitor;
+
+/**  
+ * Extract the body of a Confluence page using the Jsoup library.
+ * This creates an identical (flat) directory structure containing the cleaned-up documents.
+ */
+public class ScrapeConfluence {
+  static final Pattern PRE_CODE_CLASS_PATTERN = Pattern.compile("brush:\\s+([^;]+)");
+  static final Pattern ANCHOR_ID_CLEANER = Pattern.compile("[^A-Za-z0-9\\.\\-\\_\\#]+");
+  static final Pattern LEADING_SPACE_PATTERN = Pattern.compile("\\A\\s+");
+  static final Pattern TRAILING_SPACE_PATTERN = Pattern.compile("\\s+\\Z");
+  static final Pattern ONLY_SPACE_PATTERN = Pattern.compile("\\A\\s*\\Z");
+  static final Pattern JAVADOC_URL_PATH_PATTERN = Pattern.compile("/(solr|core)/\\d+_\\d+_\\d+(/.*)");
+  
+    public static void main(String[] args) throws Exception {
+        if (args.length < 3) {
+            System.err.println("usage: ScrapeConfluence "
+                               + "<indir> <page-tree.xml> <outdir>");
+            System.exit(-1);
+        }
+        File inputDir = new File(args[0]);
+        File pageTreeXmlFile = new File(args[1]);
+        PageTree pageTree = new PageTree(pageTreeXmlFile);
+        File outputDir = new File(args[2]);
+        File imagesDir = new File(outputDir, "images");
+        if (! (imagesDir.exists() || imagesDir.mkdirs() ) ) {
+          throw new RuntimeException("Unable to create images dir: " + imagesDir.toString());
+        }
+        
+        HtmlFileFilter htmlFilter = new HtmlFileFilter();
+        File[] pages = inputDir.listFiles(htmlFilter);
+        for (File page : pages) {
+            if (page.getName().equals("index.html")) {
+              // we don't need/want you
+              // although i really wish i'd realized this page was in the HTML export before
+              // i did all that work to build page-tree.xml from the XML export
+              continue;
+            }
+          
+            System.out.println("input Page URI: " + page.toURI().toString());
+            final Element pageTreePage = pageTree.getPage(page.toURI().toString());
+            final String pageName = pageTree.getPageShortName(pageTreePage);
+            final String title = pageTree.getPageTitle(pageTreePage);
+            final String permalink = pageName + ".html";
+            final File outPage = new File(outputDir, permalink);
+            System.out.println("outPage URI: " + outPage.toURI().toString());
+            
+            if (outPage.exists()) {
+              throw new RuntimeException(permalink + " already exists - multiple files with same shortname: " + page + " => " + outPage);
+            }
+
+            // Confluence encodes &nbsp; as 0xa0.
+            // JSoup API doesn't handle this - change to space before parsing Document
+            String fileContents = readFile(page.getPath());
+            fileContents = fileContents.replace('\u00a0',' ');
+
+            // parse Confluence page
+            Document doc = Jsoup.parse(fileContents);
+            Element mainContent = doc.select("#main-content").first();
+            if (mainContent == null) {
+              throw new RuntimeException(page.getName() + " has no main-content div");
+            }
+            
+            // create clean HTML page
+            Document docOut = Document.createShell(outPage.toURI().toString());
+            docOut.title(title);
+
+            addMetadata(docOut, "page-shortname", pageName);
+            addMetadata(docOut, "page-permalink", permalink);
+            for (Element kid : pageTreePage.children()) {
+              addMetadata(docOut, "page-children", pageTree.getPageShortName(kid));
+            }
+
+            
+            docOut.body().appendChild(mainContent);
+            docOut.normalise();
+
+            cleanupContent(docOut);
+
+            // fix links
+            Elements elements = docOut.select("a[href]");
+            for (Element element : elements) {
+              element.attr("href", fixLink(page, pageTree, element.attr("href")));
+            }
+            
+            // fix (and copy) images
+            for (Element element : docOut.select("img")) {
+              String src = element.attr("src");
+              // attachments can be referenced by other pages
+              String imagePageId = element.attr("data-linked-resource-container-id");
+              String filename = element.attr("data-linked-resource-default-alias");
+              if (null == imagePageId || null == filename ||
+                  "".equals(imagePageId) || "".equals(filename)) {
+                // this is some standard confluence image, not an attachment
+                // assume it's already been copied into place, and leave the src attr alone
+                continue;
+              }
+              String imagePageShortName = pageTree.getPageShortName(pageTree.getPage
+                                                                    (Integer.valueOf(imagePageId)));
+              
+              // copy the file to the desired path if we haven't already...
+              File imagePageDir = new File(imagesDir, imagePageShortName);
+              File imageFile = new File(imagePageDir, filename);
+              if (! imageFile.exists()) {
+                File origImageFile = new File(inputDir, src);
+                if (! origImageFile.exists()) {
+                  throw new RuntimeException("unable to find image: " + origImageFile + " for img in " +
+                                             page.toString());
+                }
+                if (! (imagePageDir.exists() || imagePageDir.mkdirs() ) ) {
+                  throw new RuntimeException("unable to makedirs: " + imagePageDir + " for img: " + src +
+                                             " in " + page.toString());
+                }
+                Files.copy(origImageFile.toPath(), imageFile.toPath());
+              }
+              
+              // rewrite the src attribute
+              element.attr("src", "images/" + imagePageShortName + "/" + filename);
+              // put each image in its own paragraph (block type elements in adoc)
+              element.wrap("<p></p>");
+            }
+
+            // TODO: need to look for non image attachments and copy them as well
+            // ie: SVG files used to create some of these images
+            
+            docOut.normalise();
+            OutputStream out = new FileOutputStream(outPage);
+            Writer writer = new OutputStreamWriter(out,"UTF-8");
+            BufferedWriter bufWriter = new BufferedWriter(writer);
+            bufWriter.write(docOut.toString());
+            bufWriter.write("\n");
+            bufWriter.close();
+            writer.close();
+            out.close();
+        }
+    }
+
+    static String readFile(String fileName) throws IOException {
+        InputStream in = new FileInputStream(fileName);
+        Reader reader = new InputStreamReader(in,"UTF-8");
+        BufferedReader br = new BufferedReader(reader);
+        try {
+            StringBuilder sb = new StringBuilder();
+            String line = br.readLine();
+            while (line != null) {
+                sb.append(line);
+                sb.append("\n");
+                line = br.readLine();
+            }
+            return sb.toString();
+        } finally {
+            br.close();
+        }
+    }
+
+  static String fixLink(File page, PageTree pageTree, final String href) {
+    try {
+      URI uri = new URI(href);
+      if (uri.isAbsolute()) {
+        // check if it's a javadoc URL and if so update to use our adoc attribute
+        final Matcher matcher = JAVADOC_URL_PATH_PATTERN.matcher(uri.getPath());
+        if (uri.getHost().equals("lucene.apache.org") && matcher.matches()) {
+          String path = matcher.group(2);
+          return (matcher.group(1).equals("core") ? "{lucene-javadocs}" : "{solr-javadocs}") + path;
+        } else {
+          return href;
+        }
+      }
+      // else: not an absolute URL...
+      
+      // any relative URL will get 'REL_LINK//' prepended so we can post-process
+      // the .adoc files to convert from the "link:xxx" syntax to the <<xxx>> syntax
+      // since pandoc doesn't have native support for that.
+      final String PRE = "REL_LINK//";
+      
+      String path = uri.getPath(); 
+      Element linkedPage = pageTree.getPageIfMatch(path);
+      
+      if ("".equals(path)) { // fragment only URL (ie: same page)
+        return PRE + fixAnchorId(href);
+      } else if (null != linkedPage) {
+        final String linkShortName = pageTree.getPageShortName(linkedPage);
+        path = linkShortName + ".adoc";
+
+        String frag = uri.getFragment();
+        if (null == frag || "".equals(frag)) {
+          // we have to have a fragment for intra-page links to work correctly in asciidoc
+          frag = linkShortName;
+        }
+        frag = fixAnchorId(frag);
+        
+        // HACKish, to ensure we get a clean path + ?query? + fragment
+        // (assuming we have any query parts in our relative urls to worry about)
+        String fixed = new URI(null, null, path, uri.getQuery(), frag).toString();
+        return PRE + fixed;
+        
+      } // else: no idea what this is...
+
+      System.err.println("found odd rel link: " + href + " in " + page.toString());
+      return PRE + href;
+
+      
+    } catch (URISyntaxException se) {
+      System.err.println("found malformed URI " + href + " in " + page.toString());
+      // assume we should leave it alone...
+      return href;
+    }
+
+  }
+  
+  static void addMetadata(Document docOut, String name, String content) {
+      Element meta = new Element(Tag.valueOf("meta"),".");
+      meta.attr("name", name);
+      meta.attr("content", content);
+      docOut.head().appendChild(meta);
+  }
+  
+  
+  static void cleanupContent(Document docOut) {
+    // start cleanup
+    Elements elements = null;
+    
+    // remove side panels (page-internal ToCs)
+    Element sideBar = docOut.select("[data-type=aside]").first();
+    if (null == sideBar) {
+      // sometimes they aren't an 'aside', they are columns containing panels
+      elements = docOut.select("div.columnMacro");
+      for (Element element : elements) {
+        if (! element.select("div.toc-macro").isEmpty()) {
+          sideBar = element;
+          break;
+        }
+      }
+    }
+    if (null == sideBar) {
+      // final scenario: toc by itself in the page body...
+      elements = docOut.select("div.toc-macro");
+      for (Element element : elements) {
+        if (! element.select("div.toc-macro").isEmpty()) {
+          sideBar = element;
+          break;
+        }
+      }
+    }
+    if (sideBar != null) {
+      // TODO: this currently replaces the entire aside/column/panel if there was one...
+      // ...would it be better to leave the other panel text and only remove the div.toc-macro?
+      //  examples:
+      //    Covered in this section:
+      //    Topics covered in this section:
+      //    Filters discussed in this section:
+      //    Algorithms discussed in this section:
+
+      // NOTE: consciously choosing to completely remove the TOC, instead of adding any metadata/macros to it
+      // let the page presentation decide if/when to use a TOC...
+      //
+      sideBar.remove();
+      // sideBar.replaceWith(new TextNode("toc::[]",""));
+      // addMetadata(docOut, "toc", "true");
+      
+
+    } else {
+      // sanity check if we missed any (multiple TOCs on a page?) ...
+      elements = docOut.select("div.toc-macro");
+      if (! elements.isEmpty()) {
+        System.out.println("MISSED A TOC: " + elements.toString());
+        System.exit(-1);
+      }
+    }
+    
+    // unwrap various formatting tags if they are empty
+    // NOTE: explicitly not doing 'span' here because it might be used as an anchor
+    elements = docOut.select("strong, em, p, code, pre, span:not([id])");
+    for (Element element : elements) {
+      if (!element.hasText()) {
+        element.unwrap(); // unwrap not remove! (even w/o text might be inner nodes, ex: img)
+      }
+    }
+
+    // these spans aren't particularly problematic, and will largely be ignored by pandoc either way
+    // but removing them here simplifies some of the logic we need in other cleanup later
+    // (notably when looking for tags inside of code)
+    elements = docOut.select("span.external-link, span.nolink, span.confluence-link, code span:not([id])");
+    for (Element element : elements) {
+      element.unwrap();
+    }
+    
+    // move any leading/trailing space from the leading/trailing textNodes of formatting tags
+    // out of the tags
+    // (completely removing it is dangerous because it might create run-on "words")
+    for (String tag : Arrays.asList("span", "strong", "em", "code", "p")) { 
+      elements = docOut.getElementsByTag(tag);
+      for (Element element : elements) {
+        // Note: not using textNodes() because our first text node may not be our first child,
+        // we don't want to munge spaces from the middle of our html if it just happens to be the
+        // first direct TextNode 
+        List<Node> kids = element.childNodes();
+        if (! kids.isEmpty()) {
+          if (kids.get(0) instanceof TextNode) {
+            TextNode t = (TextNode) kids.get(0);
+            Matcher m = LEADING_SPACE_PATTERN.matcher(t.text());
+            if (m.matches()) {
+              t.text(m.replaceAll(""));
+              element.before(" ");
+            }
+          }
+          if (kids.get(kids.size()-1) instanceof TextNode) {
+            TextNode t = (TextNode) kids.get(kids.size()-1);
+            Matcher m = TRAILING_SPACE_PATTERN.matcher(t.text());
+            if (m.matches()) {
+              t.text(m.replaceAll(""));
+              element.after(" ");
+            }
+          }
+        }
+      }
+    }
+
+    // this is totally bogus, and yet confluence is doing this...
+    elements = docOut.select("code code");
+    for (Element element : elements) {
+      element.unwrap();
+    }
+    
+    // fake out pandoc when an em or strong tag is inside of a code tag
+    elements = docOut.select("code strong");
+    for (Element element : elements) {
+      element.prependText("**");
+      element.appendText("**");
+      element.unwrap();
+    }
+    elements = docOut.select("code em");
+    for (Element element : elements) {
+      element.prependText("__");
+      element.appendText("__");
+      element.unwrap();
+    }
+
+    // in asciidoc, links can wrap code, but code can not wrap links
+    // so we need to invert the relationship if/when we find it...
+    elements = docOut.select("code > a:only-child");
+    for (Element element : elements) {
+      Element code = element.parent();
+      String href= element.attr("href");
+      element.unwrap();
+      if (! href.equals(code.text())) {
+        // if the entire code block is a URL, we don't need to wrap it in another link
+        // asciidoctor will take care of that for us.
+        code.wrap("<a href=\""+href+"\"></a>");
+      }
+    }
+    
+    // remove confluence styles
+    elements = docOut.select("[style]");
+    for (Element element : elements) {
+      element.removeAttr("style");
+    }
+    // remove confluence themes from <pre> tags
+    elements = docOut.getElementsByTag("pre");
+    for (Element element : elements) {
+      if (element.hasAttr("class")) {
+        Matcher codeType = PRE_CODE_CLASS_PATTERN.matcher(element.attr("class"));
+        if (codeType.find()) {
+          String codeClass = codeType.group(1);
+          // some munging needed in some cases...
+          if (codeClass.equals("html/xml")) {
+            codeClass = "xml";
+          }
+          if (codeClass.equals("js")) {
+            // almost no javascript in ref guide, assume it should really be json
+            codeClass = "json";
+          }
+          if (element.text().startsWith("curl ")) {
+            // if this looks like a curl command, then ignore whatever class might have been in
+            // confluence and treat it as bash
+            codeClass = "bash";
+          }
+          // TODO: other values we should also change here? "powershell" ?
+          element.attr("class", codeClass);
+        } else {
+          element.removeAttr("class");
+        }
+      }
+    }
+
+    // confluence has a nasty habit of (sometimes) putting named anchors people explicitly define
+    // *inside* a header, instead of around/before it.
+    // so if we find any of these, we need to rearrange some things to work around some problems...
+    // https://github.com/asciidoctor/asciidoctor/issues/1875
+    //
+    // NOTE: just moving an explicit anchor before the header should work, but because of how ids on headers
+    // are treated in asciidoc, and some weirdness in how asciidoctor treats multiple anchors
+    // declared in front of a header, this causes all but one of the anchors to be ignored...
+    //
+    // https://github.com/asciidoctor/asciidoctor/issues/1874
+    //
+    // because of this, we'll use the "explicitly" defined anchor macro from confluence as our "main"
+    // id for the header, and move the existing header id to its own declaration.
+    //
+    // that should result in both still existing in the final adoc file (so they are easy to grep for)
+    // but the one that is most likely to have links to it will be the one used by default in generated html.
+    for (int level = 1; level < 7; level++) {
+      final String h = "h" + level;
+      elements = docOut.getElementsByTag(h);
+      for (Element header : elements) {
+        // first see if we are immediately preceded by an explicit anchor macro...
+        // (any wrapping <p> tags should have already been unwrapped for us)
+        Element previous = header.previousElementSibling();
+        if (null != previous && "span".equals(previous.tagName()) && previous.classNames().contains("confluence-anchor-link")) {
+          // swap the id from this "previous" macro declaration with the "id" of the header
+          final String oldId = header.attr("id");
+          header.attr("id", previous.attr("id"));
+          previous.attr("id", oldId);
+        }
+          
+        // next, look for any anchors declared inside the header...
+        Elements inner = header.getElementsByClass("confluence-anchor-link");
+        for (Element anchor : inner) {
+          final String oldId = header.attr("id");
+          header.attr("id", anchor.attr("id"));
+          if (null != oldId) {
+            // flip id and move the anchor before the header
+            anchor.attr("id", oldId);
+            header.before(anchor);
+          } else {
+            // just remove the anchor completely
+            // (don't think this code path is possible, but including for completeness)
+            anchor.remove();
+          }
+        }
+      }
+    }
+    
+    // replace icon text
+    elements = docOut.getElementsByClass("confluence-information-macro");
+    for (Element element : elements) {
+      final String admonishment = getAdmonishment(element);
+      Elements titles = element.select(".title");
+      if (1 < titles.size()) {
+        System.err.println("admonishment macro has more then 1 title: " + element.outerHtml());
+        System.exit(-1);
+      }
+
+      // it's easier to post-process this than to try and fight the html->pandoc->adoc conversion
+      for (Element title : titles) { // only one, loop is easy
+        title.prependText("TODO_ADMON_TITLE:");
+        element.before(title); // move it before the block
+      }
+      element.prependChild((new Element(Tag.valueOf("p"), ".")).prependText("[" + admonishment + "]===="));
+      element.appendChild((new Element(Tag.valueOf("p"), ".")).prependText("===="));
+    }
+
+    // unwrap various block tags if they are empty
+    for (String tag : Arrays.asList("div","tbody")) {
+      elements = docOut.getElementsByTag(tag);
+      for (Element element : elements) {
+        element.unwrap(); // unwrap not remove! (might be inner nodes, ex: img)
+      }
+    }
+    
+    // remove breaks -- TODO: why?
+    elements = docOut.getElementsByTag("br");
+    for (Element element : elements) {
+      element.remove();
+    }
+
+    // work around https://github.com/asciidoctor/asciidoctor/issues/1873
+    elements = docOut.select("[id]");
+    for (Element element : elements) {
+      final String oldId = element.attr("id");
+      final String newId = fixAnchorId(oldId);
+      if (! oldId.equals(newId)) {
+        // would love to use jsoup's Comment class, but it doesn't survive pandoc
+        // ironically, this does...
+        Element fakeComment = new Element(Tag.valueOf("div"), "");
+        fakeComment.text("// OLD_CONFLUENCE_ID: " + oldId);
+        element.before(fakeComment);
+        element.attr("id", newId);
+      }
+    }
+
+    // pandoc gets really confused when <ol>s get nested, add a comment pointing out
+    // manual cleanup is needed
+    elements = docOut.select("ol:has(ol, ul), ul:has(ol)");
+    LIST: for (Element list : elements) {
+      // if we are wrapped in an outer list, nothing to do - already done at top level
+      for (Element parent : list.parents()) {
+        if ("ol".equals(parent.tagName()) || "ul".equals(parent.tagName())) {
+          continue LIST;
+        }
+      }
+      // would love to use jsoup's Comment class, but it doesn't survive pandoc
+      // ironically, this does...
+      Element fakeComment = new Element(Tag.valueOf("div"), "");
+      fakeComment.text("// TODO: This '"+list.tagName()+"' has problematic nested lists inside of it, needs manual editing");
+      list.before(fakeComment);
+    }
+
+    // table cells containing structural elements are problematic in PDFs...
+    elements = docOut.select("table:has(ol, ul, p ~ p, div, pre, table)");
+    TABLE: for (Element table : elements) {
+      // if we are wrapped in another table, nothing to do - already done at top level
+      for (Element parent : table.parents()) {
+        if ("table".equals(parent.tagName())) {
+          continue TABLE;
+        }
+      }
+      // would love to use jsoup's Comment class, but it doesn't survive pandoc
+      // ironically, this does...
+      Element fakeComment = new Element(Tag.valueOf("div"), "");
+      fakeComment.text("// TODO: This table has cells that won't work with PDF: https://github.com/ctargett/refguide-asciidoc-poc/issues/13");
+      table.before(fakeComment);
+    }
+
+    // final modification: get rid of any leading spaces in paragraphs
+    // (otherwise asciidoctor will treat them as a type of code formatting)
+    elements = docOut.select("p > span:not([id]):first-child");
+    for (Element element : elements) {
+      if (ONLY_SPACE_PATTERN.matcher(element.html()).matches()) {
+        element.remove();
+      }
+    }
+    
+    // in general, pandoc/asciidoctor has problems with tags inside of "code" so log if we have anything
+    elements = docOut.select("code:has(*)");
+    for (Element element : elements) {
+      System.out.println("NOTE: code tag w/nested tags: " + element.outerHtml());
+    }
+      
+    
+    docOut.normalise();
+  }
+
+  /** 
+   * work around https://github.com/asciidoctor/asciidoctor/issues/1873
+   * needs to be called on all "id" attributes, as well as any anchor text in (local) links
+   */
+  public static String fixAnchorId(String id) {
+    Matcher m = ANCHOR_ID_CLEANER.matcher(id);
+    return m.replaceAll("_");
+  }
+
+  /**
+   * convert confluence admonishment macro types to the "equivalent" adoc types we want to use
+   */
+  public static String getAdmonishment(Element e) {
+    String admon = null;
+    if (e.hasClass("confluence-information-macro-information")) {
+      return "NOTE";
+    }
+    if (e.hasClass("confluence-information-macro-tip")) {
+      return "TIP";
+    }
+    if (e.hasClass("confluence-information-macro-note")) {
+      return "IMPORTANT";
+    }
+    if (e.hasClass("confluence-information-macro-warning")) {
+      return "WARNING";
+    }
+    System.err.println("No admonishment mapping for: " + e.outerHtml());
+    System.exit(-1);
+    return null;
+  }
+  
+  /**
+   * Wraps a (Jsoup) "DOM" of the <code>page-tree.xml</code> file with convenience methods
+   * for getting the names, shortnames, and kids of various pages
+   */
+  private static final class PageTree {
+    private static final Pattern HTML_EXPORT_FILENAME = Pattern.compile("^.*?\\D?(\\d+)\\.html$");
+    private static final Pattern SHORT_NAME_CLEANER = Pattern.compile("[^a-z0-9]+");
+    // Jsoup's XML parsing is easier to work with than javax, especially getById
+    private final Document dom;
+    public PageTree(File pageTreeXml) throws Exception {
+      try (FileInputStream fis = new FileInputStream(pageTreeXml)) {
+        this.dom = Jsoup.parse(fis, null, pageTreeXml.toURI().toString(), Parser.xmlParser());
+      }
+    }
+    public Element getPage(int id) {
+      final Element ele = dom.getElementById(""+id);
+      if (null == ele) {
+        throw new NullPointerException("can't find DOM element with id: " + id);
+      }
+      return ele;
+    }
+    public Element getPage(String htmlFilePath) {
+      Element page = getPageIfMatch(htmlFilePath);
+      if (null != page) {
+        return page;
+      } // else...
+      throw new RuntimeException("Can't match page path pattern for html path: " + htmlFilePath);
+    }
+    public Element getPageIfMatch(String htmlFilePath) {
+      if (null == htmlFilePath || 0 == htmlFilePath.length()) {
+        return null;
+      }
+      Matcher m = HTML_EXPORT_FILENAME.matcher(htmlFilePath);
+      if (m.matches()) {
+        int id = Integer.valueOf(m.group(1));
+        return getPage(id);
+      } // else...
+      return null;
+    }
+    public String getPageTitle(Element page) {
+      String title = page.attr("title");
+      if (null == title) {
+        throw new NullPointerException("Page has null title attr");
+      }
+      return title;
+    }
+    public String getPageShortName(Element page) {
+      Matcher m = SHORT_NAME_CLEANER.matcher(getPageTitle(page).toLowerCase(Locale.ROOT));
+      return m.replaceAll("-");
+    }
+    public String getPageShortName(String htmlFilePath) {
+      return getPageShortName(getPage(htmlFilePath));
+    }
+  }
+}
+
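
For reference, the core of the per-page scrape above reduces to a handful of
jsoup calls. A minimal sketch of just the extract-and-reshell step, with all
of the link/image/anchor cleanup omitted (file paths are hypothetical):

    import java.io.File;
    import java.nio.charset.StandardCharsets;
    import java.nio.file.Files;

    import org.jsoup.Jsoup;
    import org.jsoup.nodes.Document;
    import org.jsoup.nodes.Element;

    public class ScrapeSketch {
        public static void main(String[] args) throws Exception {
            File page = new File(args[0]);     // one exported Confluence page
            File outPage = new File(args[1]);  // where the cleaned page goes

            // Confluence encodes &nbsp; as 0xa0; normalize to plain spaces first
            String html = new String(Files.readAllBytes(page.toPath()), StandardCharsets.UTF_8)
                              .replace('\u00a0', ' ');

            Document doc = Jsoup.parse(html);
            Element mainContent = doc.select("#main-content").first();
            if (mainContent == null) {
                throw new RuntimeException(page.getName() + " has no main-content div");
            }

            // build a clean HTML shell containing only the page body
            Document docOut = Document.createShell(outPage.toURI().toString());
            docOut.title(doc.title());
            docOut.body().appendChild(mainContent);
            docOut.normalise();

            Files.write(outPage.toPath(), docOut.toString().getBytes(StandardCharsets.UTF_8));
        }
    }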

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/page-hierarchy.xsl
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/page-hierarchy.xsl b/solr/confluence-export/conversion-tools/page-hierarchy.xsl
new file mode 100644
index 0000000..39cefce
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/page-hierarchy.xsl
@@ -0,0 +1,81 @@
+<xsl:stylesheet version="1.0"
+                xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+  <!-- A style sheet that can be applied to entities.xml from a Confluence dump
+       and produces just the bare bones data about the hierarchy of pages in (the|each) space,
+       in the order they appear as children of their parent
+  -->
+  
+  <xsl:output indent="yes"/>
+  
+  <xsl:template match="/">
+    <confluence>
+      <xsl:apply-templates select="//object[@class='Space']"/>
+    </confluence>
+  </xsl:template>
+  
+  <xsl:template match="object[@class='Space']">
+    <space>
+      <xsl:attribute name="id"><xsl:value-of select="./id/text()"/></xsl:attribute>
+      <xsl:attribute name="name"><xsl:value-of select="./property[@name='name']/text()"/></xsl:attribute>
+
+      <!-- can't just look for "pages that have no parent" because that will also match old versions.
+           (even the historical versions of pages have a status of 'current')
+      -->
+      <!--
+          So instead look for any page that is part of the space, and does have some position
+          (sort order in space), but does not have a parent
+      -->
+      <xsl:apply-templates select="//object[@class='Page'][boolean(property[@name='position']/text())][not(property[@name='parent'])][property[@name='space']/id/text()=current()/id/text()]" >
+        <!-- NOTE: sort duplicated in recursive Page template below -->
+        <xsl:sort data-type="number" order="ascending"
+                  select="property[@name='position']/text()" />
+        <!-- apparently pages only have position if a user has explicitly sorted?
+             otherwise it looks like they default to sort by title? -->
+        <xsl:sort data-type="text" order="ascending"
+                  select="property[@name='title']/text()" />
+      </xsl:apply-templates>
+    </space>
+  </xsl:template>
+
+  <!-- NOTE: This template is recursive -->
+  <xsl:template match="object[@class='Page']">
+    <page>
+      <xsl:attribute name="id"><xsl:value-of select="./id/text()"/></xsl:attribute>
+      <xsl:attribute name="title"><xsl:value-of select="./property[@name='title']/text()"/></xsl:attribute>
+      <!-- add parent info redundantly in case it's helpful -->
+      <xsl:if test="./property[@name='parent']/id">
+        <xsl:attribute name="parent"><xsl:value-of select="./property[@name='parent']/id/text()"/></xsl:attribute>
+      </xsl:if>
+
+      <!-- the sort order, if explicitly set by a confluence user at some point.
+           If this has never been set for a group of children, it apparently defaults to
+           sorting all those children by alpha page title
+      -->
+      <xsl:if test="./property[@name='position']/node()">
+        <xsl:attribute name="sort"><xsl:value-of select="./property[@name='position']/text()"/></xsl:attribute>
+      </xsl:if>
+      
+      <!-- NOTE: doing a for-each on collection[@name='children'] won't work....
+           collection isn't sorted, need to use "position" property from the Pages themselves
+           
+           <xsl:for-each select="collection[@name='children']/element[@class='Page']/id/text()">
+           <xsl:apply-templates select="//object[@class='Page'][id/text()=current()]"/>
+           </xsl:for-each>
+      -->
+      
+      <!-- instead we go out and select every page that has a parent which matches our id
+           (thank god for the parent property) and (recursively) apply templates in "position" sorted order
+      -->
+      <xsl:apply-templates select="//object[@class='Page'][property[@name='parent']/id/text()=current()/id/text()]">
+        <!-- NOTE: sort duplicated in Space template above -->
+        <xsl:sort data-type="number" order="ascending"
+                  select="property[@name='position']/text()" />
+        <!-- apparently pages only have position if a user has explicitly sorted?
+             otherwise it looks like they default to sort by title? -->
+        <xsl:sort data-type="text" order="ascending"
+                  select="property[@name='title']/text()" />
+      </xsl:apply-templates>
+    </page>
+  </xsl:template>
+  <xsl:template match="object" /><!-- No-Op for other types of objects -->
+</xsl:stylesheet>
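
For context, a hypothetical (abbreviated) example of the page-tree.xml this
stylesheet produces - top-level pages sit directly under their space, children
nest recursively, and the parent/sort attributes appear only when present:

    <confluence>
      <space id="12345" name="Apache Solr Reference Guide">
        <page id="23456" title="About This Guide">
          <page id="34567" title="Getting Started" parent="23456" sort="1"/>
        </page>
      </space>
    </confluence>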

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/post-process-adocs.pl
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/post-process-adocs.pl b/solr/confluence-export/conversion-tools/post-process-adocs.pl
new file mode 100755
index 0000000..c471558
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/post-process-adocs.pl
@@ -0,0 +1,39 @@
+#!perl -i
+
+use strict;
+use warnings;
+
+while (my $line = <>) {
+    # pandoc uses '=========...' syntax for doc title, we want shorter "= TITLE" syntax
+    if (1 == $.) {
+	$line = "= $line";
+    } elsif ((2 == $.) && $line =~ /^=+$/) {
+	next; # skip this line completely
+    }
+
+    # table syntax doesn't need to be so verbose
+    $line =~ s{^\|\={3,}+$}{|===};
+	
+    # fix up relative links (in place edit) -- NOTE: links to anchor in same page get '#' stripped
+    $line =~ s{link:REL_LINK//#?(.*?)\[(.*?)\]}{\<\<$1,$2\>\>}g;
+
+    # fix up javadoc links, since pandoc escapes our attribute syntax
+    $line =~ s<link:%7B(.*?)%7D><{$1}>g;
+
+    # switch all images from inline to 'block' (double colon) and put on their own line of the file
+    # TODO: any attributes we want to add to every image?
+    $line =~ s{image:(.*?)\[(.*?)\]}{image::$1\[$2\]\n}g;
+
+    # admonishments...
+    if ($line =~ s{^TODO_ADMON_TITLE:}{.}) {
+	# next line should be blank, trash it
+	my $trash = <>;
+	$trash =~ /^$/ or die "not a blank trash line: $trash";
+    }
+    $line =~ s{^(\[\w+\])====$}{$1\n====};
+
+    # fixup obviously intended quoted code (otherwise "`foo`" just gets curly quoted)
+    $line =~ s{"`(\w+)`"}{"```$1```"}g;
+    
+    print $line;
+}
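
For reference, the REL_LINK rewrite above turns a pandoc-emitted line like the
first one below into the second (page and anchor names hypothetical):

    link:REL_LINK//query-syntax.adoc#query-syntax[Query Syntax]
    <<query-syntax.adoc#query-syntax,Query Syntax>>

Similarly, an escaped javadoc link such as link:%7Bsolr-javadocs%7Dsolr-core/
becomes {solr-javadocs}solr-core/.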

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/toAsciidoc.sh
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/toAsciidoc.sh b/solr/confluence-export/conversion-tools/toAsciidoc.sh
new file mode 100755
index 0000000..e6238be
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/toAsciidoc.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# PoC demonstration of complete migration from Confluence
+# script to walk over pagetree of cleaned-up HTML pages from Confluence
+# process html to asciidoc
+# reconvert asciidoc to html
+
+
+# parent dir of script until/unless we move it
+WORK_DIR=$(realpath -L "$(dirname $0)/../")
+
+if [ ! -d "$WORK_DIR" ]
+then
+    echo "$WORK_DIR does not exist (as a directory)"
+    exit -1
+fi
+
+# check that we have the expected version of pandoc
+PANDOC_VER=`pandoc --version | head -1 | cut -d' ' -f 2 | cut -d'.' -f 1-2`
+if [ "$PANDOC_VER" != "1.17" ]
+then
+    echo "Only tested with pandoc 1.17, you are using $PANDOC_VER"
+    exit -1
+fi
+
+PANDOC_TEMPLATE="$WORK_DIR/conversion-tools/custom.pandoc.template"
+if [ ! -e "$PANDOC_TEMPLATE" ]
+then
+    echo "$PANDOC_TEMPLATE does not exist"
+    exit -1
+fi
+
+HTML_DIR="$WORK_DIR/cleaned-export"
+ASCII_DIR="$WORK_DIR/../solr-ref-guide/src"
+
+rm $ASCII_DIR/*.adoc
+
+echo "Coping images..."
+rm -rf $ASCII_DIR/images
+cp -r $HTML_DIR/images $ASCII_DIR/images
+
+for x in `find $HTML_DIR -name "*.html"`
+do
+    echo $x;
+    FNAME=`echo ${x} | sed -e "s#${HTML_DIR}/##"`
+    DIRNAME=$(dirname ${FNAME})
+    mkdir -p "$ASCII_DIR/$DIRNAME"
+    
+    # convert to .asciidoc format using pandoc
+    pandoc $HTML_DIR/$FNAME -f html -t asciidoc -i --parse-raw --wrap=none --standalone --atx-headers --template=$PANDOC_TEMPLATE -o ${ASCII_DIR}/${FNAME%.*}.adoc
+
+    perl "$WORK_DIR/conversion-tools/post-process-adocs.pl" ${ASCII_DIR}/${FNAME%.*}.adoc
+done;
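
For example (page name hypothetical), $HTML_DIR/about-this-guide.html would be
converted to $ASCII_DIR/about-this-guide.adoc by pandoc and then edited in
place by post-process-adocs.pl.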


[2/2] lucene-solr:jira/solr-10290: SOLR-10290: Add ivy and build files, etc.

Posted by ct...@apache.org.
SOLR-10290: Add ivy and build files, etc.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ec324b29
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ec324b29
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ec324b29

Branch: refs/heads/jira/solr-10290
Commit: ec324b294ce858733dd014399a27ccb2cb513def
Parents: 8736246
Author: Cassandra Targett <ct...@apache.org>
Authored: Thu Mar 16 11:51:06 2017 -0500
Committer: Cassandra Targett <ct...@apache.org>
Committed: Thu Mar 16 11:51:06 2017 -0500

----------------------------------------------------------------------
 solr/solr-ref-guide/.gitignore  |   1 +
 solr/solr-ref-guide/README.adoc |  23 +++++
 solr/solr-ref-guide/build.xml   | 188 +++++++++++++++++++++++++++++++++++
 solr/solr-ref-guide/ivy.xml     |  10 ++
 4 files changed, 222 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec324b29/solr/solr-ref-guide/.gitignore
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/.gitignore b/solr/solr-ref-guide/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/solr/solr-ref-guide/.gitignore
@@ -0,0 +1 @@
+build/

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec324b29/solr/solr-ref-guide/README.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/README.adoc b/solr/solr-ref-guide/README.adoc
new file mode 100644
index 0000000..dcca883
--- /dev/null
+++ b/solr/solr-ref-guide/README.adoc
@@ -0,0 +1,23 @@
+= Solr Ref Guide
+
+This is the source for the Solr Reference Guide.
+
+Raw content is stored in Asciidoc (`.adoc`) formatted files in the `src/` directory.
+
+These files are processed with AsciiDoctor in 2 different ways:
+
+* Via 'Jekyll' to build an HTML browsable version of the Ref Guide
+** NOTE: This currently requires that you have already installed `ruby`, `jekyll`, and the `jekyll-asciidoc` plugin locally
+* Via `asciidoctor-ant` to build the officially released PDF version of the Ref Guide
+
+For details on building the ref guide, see `ant -p`.
+
+Key directories to be aware of:
+
+* `src` - where all human-edited `*.adoc` files related to the Guide live, as well as various configuration, theme, and template files.
+* `tools` - custom Java code for parsing metadata in our `src/*.adoc` files to produce some `_data/` files for site & pdf navigation purposes.
+* `build/content` - a copy of the `src` dir generated by ant where:
+** `*.template` files are processed to replace ant properties with their runtime values
+** some `build/content/_data` files are generated by our java tools based on header attributes from each of the `*.adoc` files
+* `build/html-site` - HTML generated version of the ref guide
+* `build/apache-solr-ref-guide-X.Y.pdf` - PDF generated version of the ref guide
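
For example, with the targets defined in the build.xml that follows, the
common invocations should be:

    ant build-pdf    # PDF only
    ant build-site   # HTML site only (requires a local jekyll install)
    ant build        # both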

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec324b29/solr/solr-ref-guide/build.xml
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/build.xml b/solr/solr-ref-guide/build.xml
new file mode 100644
index 0000000..444db44
--- /dev/null
+++ b/solr/solr-ref-guide/build.xml
@@ -0,0 +1,188 @@
+<project xmlns:asciidoctor="antlib:org.asciidoctor.ant" xmlns:ivy="antlib:org.apache.ivy.ant" >
+
+  <!-- TODO: once we're part of the lucene-solr git repo,
+       remove these lines and replace them with an import of common-build.xml -->
+  <property name="version.base" value="6.2.42" />
+  <tstamp>
+    <format property="current.year" pattern="yyyy"/>
+    <format property="DSTAMP" pattern="yyyy-MM-dd"/>
+  </tstamp>
+  <property name="ivy.sync" value="true"/>
+  <condition property="ivy.symlink">
+    <os family="unix"/>
+  </condition>
+  <!-- <import file="....../common-build.xml"/> -->
+
+
+
+
+  <!-- properties to use in our docs -->
+  <loadresource property="solr-docs-version">
+    <propertyresource name="version.base"/>
+    <filterchain>
+      <tokenfilter>
+        <filetokenizer/>
+        <replaceregex pattern="^(\d+\.\d+)(|\..*)$" replace="\1" flags="s"/>
+      </tokenfilter>
+    </filterchain>
+  </loadresource>
+  <loadresource property="solr-docs-version-path">
+    <propertyresource name="solr-docs-version"/>
+    <filterchain>
+      <tokenfilter>
+        <filetokenizer/>
+        <replaceregex pattern="^(\d+)\.(\d+)(|\..*)$" replace="\1_\2_0" flags="s"/>
+      </tokenfilter>
+    </filterchain>
+  </loadresource>
+  <property name="solr-javadocs" value="https://lucene.apache.org/solr/${solr-docs-version-path}/" />
+  <property name="lucene-javadocs" value="https://lucene.apache.org/core/${solr-docs-version-path}/" />
+
+  <target name="resolve">
+    <mkdir dir="build/lib"/>
+    <ivy:retrieve type="jar,bundle" sync="${ivy.sync}" log="download-only" symlink="${ivy.symlink}"
+                  pattern="build/lib/[artifact]-[revision].[ext]" />
+  </target>
+
+  <property name="build.content.dir" location="build/content" />
+  <property name="main-page" value="apache-solr-reference-guide" />
+  <property name="pdf-filename" value="apache-solr-ref-guide-${solr-docs-version}.pdf" />
+
+  <!-- ====== TOOLS FOR GENERATING/VALIDATING BITS OF THE SITE / PDF ======= -->
+  <property name="tools-jar-name" value="solr-ref-guide-tools.jar" />
+  <path id="tools-compile-classpath">
+    <fileset dir="build/lib">
+      <include name="**/*.jar"/>
+      <exclude name="**/${tools-jar-name}" />
+    </fileset>
+  </path>
+  <path id="tools-run-classpath">
+    <fileset dir="build/lib">
+      <include name="**/*.jar"/>
+    </fileset>
+  </path>
+
+  <target name="clean">
+    <delete dir="build"/>
+  </target>
+
+  <target name="build-tools-jar" depends="resolve" description="Builds the custom java tools use use for generating some data files from page metdata">
+    <mkdir dir="build/classes"/>
+    <javac debug="yes"
+           debuglevel="source,lines,vars"
+           destdir="build/classes"
+           includeantruntime="false">
+      <compilerarg value="-Xlint:all"/>
+      <classpath refid="tools-compile-classpath"/>
+      <src path="tools/"/>
+    </javac>
+    <jar destfile="build/lib/${tools-jar-name}">
+      <fileset dir="build/classes"
+               includes="**/*.class"/>
+    </jar>
+  </target>
+
+  <target name="build-init" description="Prepares the build/content dir, copying over src files and transforming *.template files in the process">
+    <delete dir="${build.content.dir}" />
+    <mkdir dir="${build.content.dir}" />
+    <echo>Copying all non template files from src ...</echo>
+    <copy todir="${build.content.dir}">
+      <fileset dir="src">
+        <exclude name="**/*.template"/>
+      </fileset>
+    </copy>
+    <echo>Copy (w/prop replacement) any template files from src...</echo>
+    <copy todir="${build.content.dir}">
+      <fileset dir="src">
+        <include name="**/*.template"/>
+      </fileset>
+      <mapper type="glob" from="*.template" to="*"/>
+      <filterchain>
+        <expandproperties/>
+      </filterchain>
+    </copy>
+  </target>
+
+  <target name="build-nav-data-files" depends="build-init,build-tools-jar" description="creates nav based data files needed by both the html and pdf artifacts">
+    <mkdir dir="${build.content.dir}/_data"/>
+    <java classname="BuildNavAndPDFBody"
+          failonerror="true"
+          fork="true">
+      <classpath refid="tools-run-classpath"/>
+      <arg value="${build.content.dir}"/>
+      <arg value="${main-page}"/>
+    </java>
+  </target>
+
+  <target name="check-links-and-anchors" depends="build-init,build-tools-jar" description="Parse the HTML site files to check for problematic links or anchors">
+    <java classname="CheckLinksAndAnchors"
+          failonerror="true"
+          fork="true">
+      <classpath refid="tools-run-classpath"/>
+      <arg value="${basedir}/build/html-site"/>
+    </java>
+  </target>
+  
+  <!-- ====== PDF Build ======= -->
+  <target name="build-pdf"
+          depends="build-nav-data-files,resolve"
+          description="Builds a PDF">
+    <taskdef uri="antlib:org.asciidoctor.ant" resource="org/asciidoctor/ant/antlib.xml"
+             classpathref="tools-run-classpath"/>
+    <asciidoctor:convert
+                 sourceDirectory="${build.content.dir}/pdf"
+                 sourceDocumentName="SolrRefGuide-all.adoc"
+                 baseDir="${build.content.dir}"
+                 outputDirectory="build"
+                 backend="pdf"
+                 extensions="adoc"
+                 sourceHighlighter="coderay"
+                 embedAssets="true"
+                 imagesDir="${build.content.dir}"
+                 doctype="book"
+                 safemode="unsafe">
+      <attribute key="icons" value="font" />
+      <attribute key="icon-set" value="fa" />
+      <attribute key="docinfo!" value='' />
+      <attribute key="pdf-stylesDir" value="./pdf/themes"/>
+      <attribute key="pdf-style" value="refguide"/>
+      <attribute key="pdf-fontsDir" value="./pdf/fonts"/>
+      <attribute key="pagenums" value='' />
+      <attribute key="figure-caption!" value='' />
+      <attribute key="idprefix" value='' />
+      <attribute key="idseparator" value='-' />
+      <!-- attributes used in adoc files -->
+      <!-- NOTE: If you add any attributes here for use in adoc files, you almost certainly need to also add
+           them to the _config.yml.template file for building the jekyll site as well
+      -->
+      <attribute key="solr-docs-version" value="${solr-docs-version}" />
+      <attribute key="solr-javadocs" value="${solr-javadocs}" />
+      <attribute key="lucene-javadocs" value="${lucene-javadocs}" />
+      <attribute key="build-date" value="${DSTAMP}" />
+      <attribute key="build-year" value="${current.year}" />
+    </asciidoctor:convert>
+    <move file="build/SolrRefGuide-all.pdf" tofile="build/${pdf-filename}" />
+    <echo>Finished Building ${basedir}/build/${pdf-filename}</echo>
+  </target>
+
+  <!-- ======= HTML Site Build =======
+       Builds site with Jekyll.
+       This (for now) assumes that Jekyll (http://jekyllrb.com) is installed locally. -->
+  <target name="build-site"
+          depends="build-init,build-nav-data-files"
+          description="Builds an HTML Site w/Jekyll">
+    <echo>Running Jekyll...</echo>
+    <exec executable="jekyll" dir="${build.content.dir}">
+      <arg value="build"/>
+    </exec>
+    <echo>Ready to browse site: ${basedir}/build/html-site/${main-page}.html</echo>
+  </target>
+
+  <target name="build"
+          description="Builds both a PDF and HTML versions of the ref guide"
+          depends="build-pdf,build-site">
+    <echo>PDF: ${basedir}/build/${pdf-filename}</echo>
+    <echo>SITE: ${basedir}/build/html-site/${main-page}.html</echo>
+  </target>
+
+</project>
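
To make the version mangling above concrete: with version.base=6.2.42, the
first replaceregex yields solr-docs-version=6.2 and the second yields
solr-docs-version-path=6_2_0, so solr-javadocs resolves to
https://lucene.apache.org/solr/6_2_0/ and lucene-javadocs to
https://lucene.apache.org/core/6_2_0/.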

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec324b29/solr/solr-ref-guide/ivy.xml
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/ivy.xml b/solr/solr-ref-guide/ivy.xml
new file mode 100644
index 0000000..f625aed
--- /dev/null
+++ b/solr/solr-ref-guide/ivy.xml
@@ -0,0 +1,10 @@
+<ivy-module version="2.0">
+  <info organisation="org.apache.solr" module="ref-guide-tools"/>
+  <dependencies>
+    <!-- v1.5.3 of asciidoctor-ant includes asciidoctorj-pdf 1.5.0-alpha.11,
+         which is the same as asciidoctor-pdf 1.5.0-alpha.11  -->
+    <dependency org="org.asciidoctor" name="asciidoctor-ant" rev="1.5.3" />
+    <dependency org="org.json" name="json" rev="20160810" />
+    <dependency org="org.jsoup" name="jsoup" rev="1.8.2" />
+  </dependencies>
+</ivy-module>