Posted to commits@lucene.apache.org by ct...@apache.org on 2017/03/16 16:53:17 UTC
[1/2] lucene-solr:jira/solr-10290: SOLR-10290: Add Confluence conversion tools
Repository: lucene-solr
Updated Branches:
refs/heads/jira/solr-10290 e825f0a75 -> ec324b294
SOLR-10290: Add Confluence conversion tools
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/8736246e
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/8736246e
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/8736246e
Branch: refs/heads/jira/solr-10290
Commit: 8736246ee51e8fbf08ee8d08a1dc22cdeba50d97
Parents: e825f0a
Author: Cassandra Targett <ct...@apache.org>
Authored: Thu Mar 16 11:49:12 2017 -0500
Committer: Cassandra Targett <ct...@apache.org>
Committed: Thu Mar 16 11:49:12 2017 -0500
----------------------------------------------------------------------
.../conversion-tools/custom.pandoc.template | 38 ++
.../conversion-tools/jsoup/.gitignore | 1 +
.../conversion-tools/jsoup/build.xml | 91 +++
.../lucidworks/docparser/HtmlFileFilter.java | 10 +
.../lucidworks/docparser/ScrapeConfluence.java | 645 +++++++++++++++++++
.../conversion-tools/page-hierarchy.xsl | 81 +++
.../conversion-tools/post-process-adocs.pl | 39 ++
.../conversion-tools/toAsciidoc.sh | 53 ++
8 files changed, 958 insertions(+)
----------------------------------------------------------------------
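A typical end-to-end conversion run with these tools might look like the
following sketch (the target and script names come from jsoup/build.xml and
toAsciidoc.sh below; it assumes the Confluence HTML and XML exports have been
unpacked into raw-export/ and raw-xml-export/ under solr/confluence-export):

    # clean the raw Confluence HTML export into cleaned-export/
    # (builds the jar, generates page-tree.xml, then runs ScrapeConfluence)
    cd solr/confluence-export/conversion-tools/jsoup
    ant scrape

    # convert the cleaned HTML to asciidoc via pandoc, then post-process
    cd ..
    ./toAsciidoc.sh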
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/custom.pandoc.template
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/custom.pandoc.template b/solr/confluence-export/conversion-tools/custom.pandoc.template
new file mode 100644
index 0000000..5993767
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/custom.pandoc.template
@@ -0,0 +1,38 @@
+$if(titleblock)$
+$title$
+$for(author)$
+:author: $author$
+$endfor$
+$if(date)$
+:date: $date$
+$endif$
+$if(toc)$
+:toc:
+$endif$
+$if(page-shortname)$
+:page-shortname: $page-shortname$
+$endif$
+$if(page-permalink)$
+:page-permalink: $page-permalink$
+$endif$
+$if(page-tags)$
+:page-tags: $for(page-tags)$[$page-tags$]$sep$, $endfor$
+$endif$
+$if(page-children)$
+:page-children: $for(page-children)$$page-children$$sep$, $endfor$
+$endif$
+
+$endif$
+$for(header-includes)$
+$header-includes$
+
+$endfor$
+$for(include-before)$
+$include-before$
+
+$endfor$
+$body$
+$for(include-after)$
+
+$include-after$
+$endfor$
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/jsoup/.gitignore
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/.gitignore b/solr/confluence-export/conversion-tools/jsoup/.gitignore
new file mode 100644
index 0000000..378eac2
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/jsoup/.gitignore
@@ -0,0 +1 @@
+build
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/jsoup/build.xml
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/build.xml b/solr/confluence-export/conversion-tools/jsoup/build.xml
new file mode 100644
index 0000000..d7cc560
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/jsoup/build.xml
@@ -0,0 +1,91 @@
+<?xml version="1.0" encoding="ASCII"?>
+<project>
+
+ <property name="version"
+ value="1.0"/>
+
+ <property name="jar"
+ value="build/parsers-${version}.jar"/>
+
+ <path id="classpath">
+ <pathelement location="${jar}"/>
+ <pathelement location="lib/jsoup-1.8.2.jar"/>
+ </path>
+
+ <target name="clean">
+ <delete dir="build"/>
+ </target>
+
+ <target name="jar">
+ <mkdir dir="build/classes"/>
+ <javac debug="yes"
+ debuglevel="source,lines,vars"
+ destdir="build/classes"
+ includeantruntime="false">
+ <compilerarg value="-Xlint:all"/>
+ <classpath refid="classpath"/>
+ <src path="src/"/>
+ </javac>
+ <jar destfile="${jar}">
+ <fileset dir="build/classes"
+ includes="**/*.class"/>
+ </jar>
+ </target>
+
+ <property name="work.dir" location="../../"/>
+ <property name="from.dir" location="${work.dir}/raw-export"/>
+ <property name="cleaned.dir" location="${work.dir}/cleaned-export"/>
+ <property name="entities.xml.path" location="${work.dir}/raw-xml-export/entities.xml"/>
+ <property name="page-tree.xml.path" location="${work.dir}/page-tree.xml"/>
+
+ <target name="-dir-check">
+ <fail message="Raw (HTML) confluence export dir does not exist: ${from.dir}">
+ <condition>
+ <not>
+ <available file="${from.dir}" />
+ </not>
+ </condition>
+ </fail>
+ <fail message="Can't find entities.xml in raw (XML) confluence export dir: ${entities.xml.path}">
+ <condition>
+ <not>
+ <available file="${entities.xml.path}" />
+ </not>
+ </condition>
+ </fail>
+ </target>
+
+ <target name="-page-tree-check">
+ <uptodate property="page-tree.xml.uptodate"
+ srcfile="${entities.xml.path}"
+ targetfile="${page-tree.xml.path}"/>
+ </target>
+
+ <target name="page-tree"
+ depends="-dir-check,-page-tree-check"
+ unless="${page-tree.xml.uptodate}">
+ <xslt in="${entities.xml.path}" out="${page-tree.xml.path}"
+ style="../page-hierarchy.xsl" />
+ </target>
+
+ <target name="scrape"
+ depends="-dir-check,jar,page-tree">
+ <delete dir="${cleaned.dir}" />
+ <mkdir dir="${cleaned.dir}"/>
+ <mkdir dir="${cleaned.dir}/images"/>
+ <!-- any "images" in the raw dump are icons that are definitely used by some page
+ (the export code has already filtered them from the set of all images in confluence) -->
+ <copy todir="${cleaned.dir}/images">
+ <fileset dir="${from.dir}/images" />
+ </copy>
+ <java classname="com.lucidworks.docparser.ScrapeConfluence"
+ failonerror="true"
+ fork="true">
+ <classpath refid="classpath"/>
+ <arg value="${from.dir}"/>
+ <arg value="${page-tree.xml.path}"/>
+ <arg value="${cleaned.dir}"/>
+ </java>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
new file mode 100644
index 0000000..9bc3b8c
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/HtmlFileFilter.java
@@ -0,0 +1,10 @@
+package com.lucidworks.docparser;
+
+import java.io.File;
+import java.io.FileFilter;
+
+public class HtmlFileFilter implements FileFilter {
+ public boolean accept(File pathname) {
+ return pathname.getName().toLowerCase().endsWith("htm") || pathname.getName().toLowerCase().endsWith("html");
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
new file mode 100644
index 0000000..5497883
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/jsoup/src/com/lucidworks/docparser/ScrapeConfluence.java
@@ -0,0 +1,645 @@
+package com.lucidworks.docparser;
+
+import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.nio.file.Files;
+import java.util.Arrays;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Locale;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
+import org.jsoup.nodes.Node;
+import org.jsoup.nodes.TextNode;
+import org.jsoup.parser.Parser;
+import org.jsoup.parser.Tag;
+import org.jsoup.select.Elements;
+import org.jsoup.select.NodeVisitor;
+
+/**
+ * Extract body of Confluence page using Jsoup library.
+ * This creates an identical (flat) directory structure containing the cleaned-up documents
+ */
+public class ScrapeConfluence {
+ static final Pattern PRE_CODE_CLASS_PATTERN = Pattern.compile("brush:\\s+([^;]+)");
+ static final Pattern ANCHOR_ID_CLEANER = Pattern.compile("[^A-Za-z0-9\\.\\-\\_\\#]+");
+ static final Pattern LEADING_SPACE_PATTERN = Pattern.compile("\\A\\s+");
+ static final Pattern TRAILING_SPACE_PATTERN = Pattern.compile("\\s+\\Z");
+ static final Pattern ONLY_SPACE_PATTERN = Pattern.compile("\\A\\s*\\Z");
+ static final Pattern JAVADOC_URL_PATH_PATTERN = Pattern.compile("/(solr|core)/\\d+_\\d+_\\d+(/.*)");
+
+ public static void main(String[] args) throws Exception {
+ if (args.length < 3) {
+ System.err.println("usage: ScrapeConfluence "
+ + "<indir> <page-tree.xml> <outdir>");
+ System.exit(-1);
+ }
+ File inputDir = new File(args[0]);
+ File pageTreeXmlFile = new File(args[1]);
+ PageTree pageTree = new PageTree(pageTreeXmlFile);
+ File outputDir = new File(args[2]);
+ File imagesDir = new File(outputDir, "images");
+ if (! (imagesDir.exists() || imagesDir.mkdirs() ) ) {
+ throw new RuntimeException("Unable to create images dir: " + imagesDir.toString());
+ }
+
+ HtmlFileFilter htmlFilter = new HtmlFileFilter();
+ File[] pages = inputDir.listFiles(htmlFilter);
+ for (File page : pages) {
+ if (page.getName().equals("index.html")) {
+ // we don't need/want you
+ // although i really wish i'd realized this page was in the HTML export before
+ // i did all that work to build page-tree.xml from the XML export
+ continue;
+ }
+
+ System.out.println("input Page URI: " + page.toURI().toString());
+ final Element pageTreePage = pageTree.getPage(page.toURI().toString());
+ final String pageName = pageTree.getPageShortName(pageTreePage);
+ final String title = pageTree.getPageTitle(pageTreePage);
+ final String permalink = pageName + ".html";
+ final File outPage = new File(outputDir, permalink);
+ System.out.println("outPage URI: " + outPage.toURI().toString());
+
+ if (outPage.exists()) {
+ throw new RuntimeException(permalink + " already exists - multiple files with same shortname: " + page + " => " + outPage);
+ }
+
+ // Confluence encodes non-breaking spaces as 0xa0.
+ // JSoup API doesn't handle this - change to space before parsing Document
+ String fileContents = readFile(page.getPath());
+ fileContents = fileContents.replace('\u00a0',' ');
+
+ // parse Confluence page
+ Document doc = Jsoup.parse(fileContents);
+ Element mainContent = doc.select("#main-content").first();
+ if (mainContent == null) {
+ throw new RuntimeException(page.getName() + " has no main-content div");
+ }
+
+ // create clean HTML page
+ Document docOut = Document.createShell(outPage.toURI().toString());
+ docOut.title(title);
+
+ addMetadata(docOut, "page-shortname", pageName);
+ addMetadata(docOut, "page-permalink", permalink);
+ for (Element kid : pageTreePage.children()) {
+ addMetadata(docOut, "page-children", pageTree.getPageShortName(kid));
+ }
+
+
+ docOut.body().appendChild(mainContent);
+ docOut.normalise();
+
+ cleanupContent(docOut);
+
+ // fix links
+ Elements elements = docOut.select("a[href]");
+ for (Element element : elements) {
+ element.attr("href", fixLink(page, pageTree, element.attr("href")));
+ }
+
+ // fix (and copy) images
+ for (Element element : docOut.select("img")) {
+ String src = element.attr("src");
+ // attachments can be referenced by other pages
+ String imagePageId = element.attr("data-linked-resource-container-id");
+ String filename = element.attr("data-linked-resource-default-alias");
+ if (null == imagePageId || null == filename ||
+ "".equals(imagePageId) || "".equals(filename)) {
+ // this is some standard confluence image, not an attachment
+ // assume it's already been copied into place, and leave the src attr alone
+ continue;
+ }
+ String imagePageShortName = pageTree.getPageShortName(pageTree.getPage
+ (Integer.valueOf(imagePageId)));
+
+ // copy the file to the desired path if we haven't already...
+ File imagePageDir = new File(imagesDir, imagePageShortName);
+ File imageFile = new File(imagePageDir, filename);
+ if (! imageFile.exists()) {
+ File origImageFile = new File(inputDir, src);
+ if (! origImageFile.exists()) {
+ throw new RuntimeException("unable to find image: " + origImageFile + " for img in " +
+ page.toString());
+ }
+ if (! (imagePageDir.exists() || imagePageDir.mkdirs() ) ) {
+ throw new RuntimeException("unable to makedirs: " + imagePageDir + " for img: " + src +
+ " in " + page.toString());
+ }
+ Files.copy(origImageFile.toPath(), imageFile.toPath());
+ }
+
+ // rewrite the src attribute
+ element.attr("src", "images/" + imagePageShortName + "/" + filename);
+ // put each image in its own paragraph (block type elements in adoc)
+ element.wrap("<p></p>");
+ }
+
+ // TODO: need to look for non image attachments and copy them as well
+ // ie: SVG files used to create some of these images
+
+ docOut.normalise();
+ OutputStream out = new FileOutputStream(outPage);
+ Writer writer = new OutputStreamWriter(out,"UTF-8");
+ BufferedWriter bufWriter = new BufferedWriter(writer);
+ bufWriter.write(docOut.toString());
+ bufWriter.write("\n");
+ bufWriter.close();
+ writer.close();
+ out.close();
+ }
+ }
+
+ static String readFile(String fileName) throws IOException {
+ InputStream in = new FileInputStream(fileName);
+ Reader reader = new InputStreamReader(in,"UTF-8");
+ BufferedReader br = new BufferedReader(reader);
+ try {
+ StringBuilder sb = new StringBuilder();
+ String line = br.readLine();
+ while (line != null) {
+ sb.append(line);
+ sb.append("\n");
+ line = br.readLine();
+ }
+ return sb.toString();
+ } finally {
+ br.close();
+ }
+ }
+
+ static String fixLink(File page, PageTree pageTree, final String href) {
+ try {
+ URI uri = new URI(href);
+ if (uri.isAbsolute()) {
+ // check if it's a javadoc URL and if so update to use our adoc attribute
+ final Matcher matcher = JAVADOC_URL_PATH_PATTERN.matcher(uri.getPath());
+ if (uri.getHost().equals("lucene.apache.org") && matcher.matches()) {
+ String path = matcher.group(2);
+ return (matcher.group(1).equals("core") ? "{lucene-javadocs}" : "{solr-javadocs}") + path;
+ } else {
+ return href;
+ }
+ }
+ // else: not an absolute URL...
+
+ // any relative URL will get 'REL_LINK//' prepended so we can post-process
+ // the .adoc files to convert from the "link:xxx" syntax to the <<xxx>> syntax
+ // since pandoc doesn't have native support for that.
+ final String PRE = "REL_LINK//";
+
+ String path = uri.getPath();
+ Element linkedPage = pageTree.getPageIfMatch(path);
+
+ if ("".equals(path)) { // fragment only URL (ie: same page)
+ return PRE + fixAnchorId(href);
+ } else if (null != linkedPage) {
+ final String linkShortName = pageTree.getPageShortName(linkedPage);
+ path = linkShortName + ".adoc";
+
+ String frag = uri.getFragment();
+ if (null == frag || "".equals(frag)) {
+ // we have to have a fragment for intra-page links to work correctly in asciidoc
+ frag = linkShortName;
+ }
+ frag = fixAnchorId(frag);
+
+ // HACKish, to ensure we get clean path + ?query? + fragment
+ // (assuming we have any query parts in our relative urls to worry about)
+ String fixed = new URI(null, null, path, uri.getQuery(), frag).toString();
+ return PRE + fixed;
+
+ } // else: no idea what this is...
+
+ System.err.println("found odd rel link: " + href + " in " + page.toString());
+ return PRE + href;
+
+
+ } catch (URISyntaxException se) {
+ System.err.println("found malformed URI " + href + " in " + page.toString());
+ // assume we should leave it alone...
+ return href;
+ }
+
+ }
+
+ static void addMetadata(Document docOut, String name, String content) {
+ Element meta = new Element(Tag.valueOf("meta"),".");
+ meta.attr("name", name);
+ meta.attr("content", content);
+ docOut.head().appendChild(meta);
+ }
+
+
+ static void cleanupContent(Document docOut) {
+ // start cleanup
+ Elements elements = null;
+
+ // remove side panels (page-internal ToCs)
+ Element sideBar = docOut.select("[data-type=aside]").first();
+ if (null == sideBar) {
+ // sometimes they aren't an 'aside', they are columns containing panels
+ elements = docOut.select("div.columnMacro");
+ for (Element element : elements) {
+ if (! element.select("div.toc-macro").isEmpty()) {
+ sideBar = element;
+ break;
+ }
+ }
+ }
+ if (null == sideBar) {
+ // final scenario: toc by itself in the page body...
+ elements = docOut.select("div.toc-macro");
+ for (Element element : elements) {
+ if (! element.select("div.toc-macro").isEmpty()) {
+ sideBar = element;
+ break;
+ }
+ }
+ }
+ if (sideBar != null) {
+ // TODO: this currently replaces the entire aside/column/panel if there was one...
+ // ...would it be better to leave the other panel text and only remove the div.toc-macro?
+ // examples:
+ // Covered in this section:
+ // Topics covered in this section:
+ // Filters discussed in this section:
+ // Algorithms discussed in this section:
+
+ // NOTE: consciously choosing to completely remove the TOC, instead of adding any metadata/macros to it
+ // let the page presentation decide if/when to use a TOC...
+ //
+ sideBar.remove();
+ // sideBar.replaceWith(new TextNode("toc::[]",""));
+ // addMetadata(docOut, "toc", "true");
+
+
+ } else {
+ // sanity check if we missed any (multiple TOCs on a page?) ...
+ elements = docOut.select("div.toc-macro");
+ if (! elements.isEmpty()) {
+ System.out.println("MISSED A TOC: " + elements.toString());
+ System.exit(-1);
+ }
+ }
+
+ // unwrap various formatting tags if they are empty
+ // NOTE: explicitly not doing 'span' here because it might be used as an anchor
+ elements = docOut.select("strong, em, p, code, pre, span:not([id])");
+ for (Element element : elements) {
+ if (!element.hasText()) {
+ element.unwrap(); // unwrap not remove! (even w/o text might be inner nodes, ex: img)
+ }
+ }
+
+ // these spans aren't particularly problematic, and will largely be ignored by pandoc either way
+ // but removing them here simplifies some of the logic we need in other cleanup later
+ // (notably when looking for tags inside of code)
+ elements = docOut.select("span.external-link, span.nolink, span.confluence-link, code span:not([id])");
+ for (Element element : elements) {
+ element.unwrap();
+ }
+
+ // move any leading/trailing space from the leading/trailing textNodes of formatting tags
+ // out of the tags
+ // (completely removing it is dangerous because it might create run-on "words")
+ for (String tag : Arrays.asList("span", "strong", "em", "code", "p")) {
+ elements = docOut.getElementsByTag(tag);
+ for (Element element : elements) {
+ // Note: not using textNodes() because our first text node may not be our first child,
+ // we don't want to munge spaces from the middle of our html if it just happens to be the
+ // first direct TextNode
+ List<Node> kids = element.childNodes();
+ if (! kids.isEmpty()) {
+ if (kids.get(0) instanceof TextNode) {
+ TextNode t = (TextNode) kids.get(0);
+ Matcher m = LEADING_SPACE_PATTERN.matcher(t.text());
+ if (m.matches()) {
+ t.text(m.replaceAll(""));
+ element.before(" ");
+ }
+ }
+ if (kids.get(kids.size()-1) instanceof TextNode) {
+ TextNode t = (TextNode) kids.get(kids.size()-1);
+ Matcher m = TRAILING_SPACE_PATTERN.matcher(t.text());
+ if (m.matches()) {
+ t.text(m.replaceAll(""));
+ element.after(" ");
+ }
+ }
+ }
+ }
+ }
+
+ // this is totally bogus, and yet confluence is doing this...
+ elements = docOut.select("code code");
+ for (Element element : elements) {
+ element.unwrap();
+ }
+
+ // fake out pandoc when an em or strong tag is inside of a code tag
+ elements = docOut.select("code strong");
+ for (Element element : elements) {
+ element.prependText("**");
+ element.appendText("**");
+ element.unwrap();
+ }
+ elements = docOut.select("code em");
+ for (Element element : elements) {
+ element.prependText("__");
+ element.appendText("__");
+ element.unwrap();
+ }
+
+ // in asciidoc, links can wrap code, but code can not wrap links
+ // so we need to invert the relationship if/when we find it...
+ elements = docOut.select("code > a:only-child");
+ for (Element element : elements) {
+ Element code = element.parent();
+ String href= element.attr("href");
+ element.unwrap();
+ if (! href.equals(code.text())) {
+ // if the entire code block is a URL, we don't need to wrap it in another link
+ // asciidoctor will take care of that for us.
+ code.wrap("<a href=\""+href+"\"></a>");
+ }
+ }
+
+ // remove confluence styles
+ elements = docOut.select("[style]");
+ for (Element element : elements) {
+ element.removeAttr("style");
+ }
+ // remove confluence themes from <pre> tags
+ elements = docOut.getElementsByTag("pre");
+ for (Element element : elements) {
+ if (element.hasAttr("class")) {
+ Matcher codeType = PRE_CODE_CLASS_PATTERN.matcher(element.attr("class"));
+ if (codeType.find()) {
+ String codeClass = codeType.group(1);
+ // some munging needed in some cases...
+ if (codeClass.equals("html/xml")) {
+ codeClass = "xml";
+ }
+ if (codeClass.equals("js")) {
+ // almost no javascript in ref guide, assume it should really be json
+ codeClass = "json";
+ }
+ if (element.text().startsWith("curl ")) {
+ // if this looks like a curl command, then ignore whatever class might have been in
+ // confluence and treat it as bash
+ codeClass = "bash";
+ }
+ // TODO: other values we should also change here? "powershell" ?
+ element.attr("class", codeClass);
+ } else {
+ element.removeAttr("class");
+ }
+ }
+ }
+
+ // confluence has a nasty habit of (sometimes) putting named anchors people explicitly define
+ // *inside* a header, instead of around/before it.
+ // so if we find any of these, we need to rearrange some things to work around some problems...
+ // https://github.com/asciidoctor/asciidoctor/issues/1875
+ //
+ // NOTE: just moving an explicit anchor before the header should work, but because of how id's on headers
+ // are treated in asciidoc, and some weirdness in how asciidoctor treats multiple anchors
+ // declared in front of a header, this causes all but one of the anchors to be ignored...
+ //
+ // https://github.com/asciidoctor/asciidoctor/issues/1874
+ //
+ // because of this, we'll use the "explicitly" defined anchor macro from confluence as our "main"
+ // id for the header, and move the existing header id to its own declaration.
+ //
+ // that should result in both still existing in the final adoc file (so they are easy to grep for)
+ // but the one that is most likely to have links to it will be the one used by default in generated html.
+ for (int level = 1; level < 7; level++) {
+ final String h = "h" + level;
+ elements = docOut.getElementsByTag(h);
+ for (Element header : elements) {
+ // first see if we are immediately preceded by an explicit anchor macro...
+ // (any wrapping <p> tags should have already been unwrapped for us)
+ Element previous = header.previousElementSibling();
+ if (null != previous && "span".equals(previous.tagName()) && previous.classNames().contains("confluence-anchor-link")) {
+ // swap the id from this "previous" macro declaration with the "id" of our header
+ final String oldId = header.attr("id");
+ header.attr("id", previous.attr("id"));
+ previous.attr("id", oldId);
+ }
+
+ // next, look for any anchors declared inside the header...
+ Elements inner = header.getElementsByClass("confluence-anchor-link");
+ for (Element anchor : inner) {
+ final String oldId = header.attr("id");
+ header.attr("id", anchor.attr("id"));
+ if (null != oldId) {
+ // flip id and move the anchor before the header
+ anchor.attr("id", oldId);
+ header.before(anchor);
+ } else {
+ // just remove the anchor completely
+ // (don't think this code path is possible, but including for completeness)
+ anchor.remove();
+ }
+ }
+ }
+ }
+
+ // replace icon text
+ elements = docOut.getElementsByClass("confluence-information-macro");
+ for (Element element : elements) {
+ final String admonishment = getAdmonishment(element);
+ Elements titles = element.select(".title");
+ if (1 < titles.size()) {
+ System.err.println("admonishment macro has more then 1 title: " + element.outerHtml());
+ System.exit(-1);
+ }
+
+ // it's easier to post-process this than to try and fight the html->pandoc->adoc conversion
+ for (Element title : titles) { // only one, loop is easy
+ title.prependText("TODO_ADMON_TITLE:");
+ element.before(title); // move it before the block
+ }
+ element.prependChild((new Element(Tag.valueOf("p"), ".")).prependText("[" + admonishment + "]===="));
+ element.appendChild((new Element(Tag.valueOf("p"), ".")).prependText("===="));
+ }
+
+ // unwrap various block tags if they are empty
+ for (String tag : Arrays.asList("div","tbody")) {
+ elements = docOut.getElementsByTag(tag);
+ for (Element element : elements) {
+ element.unwrap(); // unwrap not remove! (might be inner nodes, ex: img)
+ }
+ }
+
+ // remove breaks -- TODO: why?
+ elements = docOut.getElementsByTag("br");
+ for (Element element : elements) {
+ element.remove();
+ }
+
+ // work around https://github.com/asciidoctor/asciidoctor/issues/1873
+ elements = docOut.select("[id]");
+ for (Element element : elements) {
+ final String oldId = element.attr("id");
+ final String newId = fixAnchorId(oldId);
+ if (! oldId.equals(newId)) {
+ // would love to use jsoup's Comment class, but it doesn't survive pandoc
+ // ironically, this does...
+ Element fakeComment = new Element(Tag.valueOf("div"), "");
+ fakeComment.text("// OLD_CONFLUENCE_ID: " + oldId);
+ element.before(fakeComment);
+ element.attr("id", newId);
+ }
+ }
+
+ // pandoc gets really confused when <ol>s get nested, add a comment pointing out
+ // manual cleanup is needed
+ elements = docOut.select("ol:has(ol, ul), ul:has(ol)");
+ LIST: for (Element list : elements) {
+ // if we are wrapped in an outer list, nothing to do - already done at top level
+ for (Element parent : list.parents()) {
+ if ("ol".equals(parent.tagName()) || "ul".equals(parent.tagName())) {
+ continue LIST;
+ }
+ }
+ // would love to use jsoup's Comment class, but it doesn't survive pandoc
+ // ironically, this does...
+ Element fakeComment = new Element(Tag.valueOf("div"), "");
+ fakeComment.text("// TODO: This '"+list.tagName()+"' has problematic nested lists inside of it, needs manual editing");
+ list.before(fakeComment);
+ }
+
+ // table cells containing structural elements are problematic in PDFs...
+ elements = docOut.select("table:has(ol, ul, p ~ p, div, pre, table)");
+ TABLE: for (Element table : elements) {
+ // if we are wrapped in another table, nothing to do - already done at top level
+ for (Element parent : table.parents()) {
+ if ("table".equals(parent.tagName())) {
+ continue TABLE;
+ }
+ }
+ // would love to use jsoup's Comment class, but it doesn't survive pandoc
+ // ironically, this does...
+ Element fakeComment = new Element(Tag.valueOf("div"), "");
+ fakeComment.text("// TODO: This table has cells that won't work with PDF: https://github.com/ctargett/refguide-asciidoc-poc/issues/13");
+ table.before(fakeComment);
+ }
+
+ // final modification: get rid of any leading spaces in paragraphs
+ // (otherwise asciidoctor will treat them as a type of code formatting)
+ elements = docOut.select("p > span:not([id]):first-child");
+ for (Element element : elements) {
+ if (ONLY_SPACE_PATTERN.matcher(element.html()).matches()) {
+ element.remove();
+ }
+ }
+
+ // in general, pandoc/asciidoctor has problems with tags inside of "code" so log if we have anything
+ elements = docOut.select("code:has(*)");
+ for (Element element : elements) {
+ System.out.println("NOTE: code tag w/nested tags: " + element.outerHtml());
+ }
+
+
+ docOut.normalise();
+ }
+
+ /**
+ * work around https://github.com/asciidoctor/asciidoctor/issues/1873
+ * needs to be called on all "id" attributes, as well as any anchor text in (local) links
+ */
+ public static String fixAnchorId(String id) {
+ Matcher m = ANCHOR_ID_CLEANER.matcher(id);
+ return m.replaceAll("_");
+ }
+
+ /**
+ * convert confluence admonishment macro types to the "equivalent" adoc types we want to use
+ */
+ public static String getAdmonishment(Element e) {
+ String admon = null;
+ if (e.hasClass("confluence-information-macro-information")) {
+ return "NOTE";
+ }
+ if (e.hasClass("confluence-information-macro-tip")) {
+ return "TIP";
+ }
+ if (e.hasClass("confluence-information-macro-note")) {
+ return "IMPORTANT";
+ }
+ if (e.hasClass("confluence-information-macro-warning")) {
+ return "WARNING";
+ }
+ System.err.println("No admonishment mapping for: " + e.outerHtml());
+ System.exit(-1);
+ return null;
+ }
+
+ /**
+ * Wraps a (Jsoup) "DOM" of the <code>page-tree.xml</code> file with convenience methods
+ * for getting the names, shortnames, and kids of various pages
+ */
+ private static final class PageTree {
+ private static final Pattern HTML_EXPORT_FILENAME = Pattern.compile("^.*?\\D?(\\d+)\\.html$");
+ private static final Pattern SHORT_NAME_CLEANER = Pattern.compile("[^a-z0-9]+");
+ // Jsoup's XML parsing is easier to work with than javax, especially getById
+ private final Document dom;
+ public PageTree(File pageTreeXml) throws Exception {
+ try (FileInputStream fis = new FileInputStream(pageTreeXml)) {
+ this.dom = Jsoup.parse(fis, null, pageTreeXml.toURI().toString(), Parser.xmlParser());
+ }
+ }
+ public Element getPage(int id) {
+ final Element ele = dom.getElementById(""+id);
+ if (null == ele) {
+ throw new NullPointerException("can't find DOM element with id: " + id);
+ }
+ return ele;
+ }
+ public Element getPage(String htmlFilePath) {
+ Element page = getPageIfMatch(htmlFilePath);
+ if (null != page) {
+ return page;
+ } // else...
+ throw new RuntimeException("Can't match page path pattern for html path: " + htmlFilePath);
+ }
+ public Element getPageIfMatch(String htmlFilePath) {
+ if (null == htmlFilePath || 0 == htmlFilePath.length()) {
+ return null;
+ }
+ Matcher m = HTML_EXPORT_FILENAME.matcher(htmlFilePath);
+ if (m.matches()) {
+ int id = Integer.valueOf(m.group(1));
+ return getPage(id);
+ } // else...
+ return null;
+ }
+ public String getPageTitle(Element page) {
+ String title = page.attr("title");
+ if (null == title) {
+ throw new NullPointerException("Page has null title attr");
+ }
+ return title;
+ }
+ public String getPageShortName(Element page) {
+ Matcher m = SHORT_NAME_CLEANER.matcher(getPageTitle(page).toLowerCase(Locale.ROOT));
+ return m.replaceAll("-");
+ }
+ public String getPageShortName(String htmlFilePath) {
+ return getPageShortName(getPage(htmlFilePath));
+ }
+ }
+}
+
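For debugging, the scraper can also be run directly rather than through the
ant "scrape" target; a rough equivalent invocation (unix-style classpath),
assuming "ant jar" and "ant page-tree" have already been run from the jsoup
directory:

    java -cp build/parsers-1.0.jar:lib/jsoup-1.8.2.jar \
        com.lucidworks.docparser.ScrapeConfluence \
        ../../raw-export ../../page-tree.xml ../../cleaned-export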
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/page-hierarchy.xsl
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/page-hierarchy.xsl b/solr/confluence-export/conversion-tools/page-hierarchy.xsl
new file mode 100644
index 0000000..39cefce
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/page-hierarchy.xsl
@@ -0,0 +1,81 @@
+<xsl:stylesheet version="1.0"
+ xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
+ <!-- A style sheet that can be applied to entities.xml from a Confluence dump
+ and produces just the bare bones data about the hierarchy of pages in (the|each) space,
+ in the order they appear as children of their parent
+ -->
+
+ <xsl:output indent="yes"/>
+
+ <xsl:template match="/">
+ <confluence>
+ <xsl:apply-templates select="//object[@class='Space']"/>
+ </confluence>
+ </xsl:template>
+
+ <xsl:template match="object[@class='Space']">
+ <space>
+ <xsl:attribute name="id"><xsl:value-of select="./id/text()"/></xsl:attribute>
+ <xsl:attribute name="name"><xsl:value-of select="./property[@name='name']/text()"/></xsl:attribute>
+
+ <!-- can't just look for "pages that have no parent" because that will also match old versions.
+ (even the historical versions of pages have a status of 'current')
+ -->
+ <!--
+ So instead look for any page that is part of the space, and does have some position
+ (sort order in space), but does not have a parent
+ -->
+ <xsl:apply-templates select="//object[@class='Page'][boolean(property[@name='position']/text())][not(property[@name='parent'])][property[@name='space']/id/text()=current()/id/text()]" >
+ <!-- NOTE: sort duplicated in recursive Page template below -->
+ <xsl:sort data-type="number" order="ascending"
+ select="property[@name='position']/text()" />
+ <!-- apparently pages only have a position if a user has explicitly sorted?
+ otherwise it looks like they default to sorting by title? -->
+ <xsl:sort data-type="text" order="ascending"
+ select="property[@name='title']/text()" />
+ </xsl:apply-templates>
+ </space>
+ </xsl:template>
+
+ <!-- NOTE: This template is recursive -->
+ <xsl:template match="object[@class='Page']">
+ <page>
+ <xsl:attribute name="id"><xsl:value-of select="./id/text()"/></xsl:attribute>
+ <xsl:attribute name="title"><xsl:value-of select="./property[@name='title']/text()"/></xsl:attribute>
+ <!-- add parent info redundantly in case it's helpful -->
+ <xsl:if test="./property[@name='parent']/id">
+ <xsl:attribute name="parent"><xsl:value-of select="./property[@name='parent']/id/text()"/></xsl:attribute>
+ </xsl:if>
+
+ <!-- the sort order, if explicitly set by a confluence user at some point.
+ If this has never been set for a group of children, it apparently defaults to
+ sorting all those children by alphabetical page title
+ -->
+ <xsl:if test="./property[@name='position']/node()">
+ <xsl:attribute name="sort"><xsl:value-of select="./property[@name='position']/text()"/></xsl:attribute>
+ </xsl:if>
+
+ <!-- NOTE: doing a for-each on collection[@name='children'] won't work....
+ collection isn't sorted, need to use "position" property from the Pages themselves
+
+ <xsl:for-each select="collection[@name='children']/element[@class='Page']/id/text()">
+ <xsl:apply-templates select="//object[@class='Page'][id/text()=current()]"/>
+ </xsl:for-each>
+ -->
+
+ <!-- instead we go out and select every page that has a parent which matches our id
+ (thank god for the parent property) and (recursively) apply templates in "position" sorted order
+ -->
+ <xsl:apply-templates select="//object[@class='Page'][property[@name='parent']/id/text()=current()/id/text()]">
+ <!-- NOTE: sort duplicated in Space template above -->
+ <xsl:sort data-type="number" order="ascending"
+ select="property[@name='position']/text()" />
+ <!-- apparently pages only have a position if a user has explicitly sorted?
+ otherwise it looks like they default to sorting by title? -->
+ <xsl:sort data-type="text" order="ascending"
+ select="property[@name='title']/text()" />
+ </xsl:apply-templates>
+ </page>
+ </xsl:template>
+ <xsl:template match="object" /><!-- No-Op for other types of objects -->
+</xsl:stylesheet>
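The same transform can be run without ant using any XSLT 1.0 processor; for
example with xsltproc (an assumption for illustration; the ant "page-tree"
target uses the JDK's built-in processor via the <xslt> task):

    cd solr/confluence-export/conversion-tools
    xsltproc page-hierarchy.xsl ../raw-xml-export/entities.xml > ../page-tree.xml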
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/post-process-adocs.pl
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/post-process-adocs.pl b/solr/confluence-export/conversion-tools/post-process-adocs.pl
new file mode 100755
index 0000000..c471558
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/post-process-adocs.pl
@@ -0,0 +1,39 @@
+#!perl -i
+
+use strict;
+use warnings;
+
+while (my $line = <>) {
+ # pandoc uses '=========...' syntax for doc title, we want shorter "= TITLE" syntax
+ if (1 == $.) {
+ $line = "= $line";
+ } elsif ((2 == $.) && $line =~ /^=+$/) {
+ next; # skip this line completely
+ }
+
+ # table syntax doesn't need to be so verbose
+ $line =~ s{^\|\={3,}+$}{|===};
+
+ # fix up relative links (in place edit) -- NOTE: links to anchor in same page get '#' stripped
+ $line =~ s{link:REL_LINK//#?(.*?)\[(.*?)\]}{\<\<$1,$2\>\>}g;
+
+ # fix up javadoc links, since pandoc escapes our attribute syntax
+ $line =~ s<link:%7B(.*?)%7D><{$1}>g;
+
+ # switch all images from inline to 'block' (double colon) and put on their own line of the file
+ # TODO: any attributes we want to add to every image?
+ $line =~ s{image:(.*?)\[(.*?)\]}{image::$1\[$2\]\n}g;
+
+ # admonishments...
+ if ($line =~ s{^TODO_ADMON_TITLE:}{.}) {
+ # next line should be blank, trash it
+ my $trash = <>;
+ $trash =~ /^$/ or die "not a blank trash line: $trash";
+ }
+ $line =~ s{^(\[\w+\])====$}{$1\n====};
+
+ # fixup obviously intended quoted code (otherwise "`foo`" just gets curly quoted)
+ $line =~ s{"`(\w+)`"}{"```$1```"}g;
+
+ print $line;
+}
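A quick way to see the script's effect (on a hypothetical demo file, not part
of the tools; note the -i on the shebang line means the argument is edited in
place):

    printf 'My Page\n=======\nlink:REL_LINK//some-page.adoc[Some Page]\n' > /tmp/demo.adoc
    perl post-process-adocs.pl /tmp/demo.adoc
    cat /tmp/demo.adoc
    # expected: "= My Page" as the title line, the "=======" underline dropped,
    # and the link rewritten to <<some-page.adoc,Some Page>>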
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/8736246e/solr/confluence-export/conversion-tools/toAsciidoc.sh
----------------------------------------------------------------------
diff --git a/solr/confluence-export/conversion-tools/toAsciidoc.sh b/solr/confluence-export/conversion-tools/toAsciidoc.sh
new file mode 100755
index 0000000..e6238be
--- /dev/null
+++ b/solr/confluence-export/conversion-tools/toAsciidoc.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# PoC demonstration of complete migration from Confluence
+# script to walk over pagetree of cleaned-up HTML pages from Confluence
+# process html to asciidoc
+# reconvert asciidoc to html
+
+
+# parent dir of script until/unless we move it
+WORK_DIR=$(realpath -L "$(dirname $0)/../")
+
+if [ ! -d $WORK_DIR ]
+then
+ echo "$WORK_DIR does not exist (as a directory)"
+ exit -1
+fi
+
+# check that we have the expected version of pandoc
+PANDOC_VER=`pandoc --version | head -1 | cut -d' ' -f 2 | cut -d'.' -f 1-2`
+if [ $PANDOC_VER != "1.17" ]
+then
+ echo "Only tested with pandoc 1.17, you are using $PANDOC_VER"
+ exit -1
+fi
+
+PANDOC_TEMPLATE="$WORK_DIR/conversion-tools/custom.pandoc.template"
+if [ ! -e $PANDOC_TEMPLATE ]
+then
+ echo "$PANDOC_TEMPLATE does not exist"
+ exit -1
+fi
+
+HTML_DIR="$WORK_DIR/cleaned-export"
+ASCII_DIR="$WORK_DIR/../solr-ref-guide/src"
+
+rm $ASCII_DIR/*.adoc
+
+echo "Coping images..."
+rm -rf $ASCII_DIR/images
+cp -r $HTML_DIR/images $ASCII_DIR/images
+
+for x in `find $HTML_DIR -name "*.html"`
+do
+ echo $x;
+ FNAME=`echo ${x} | sed -e "s#${HTML_DIR}/##"`
+ DIRNAME=$(dirname ${FNAME})
+ mkdir -p "$ASCII_DIR/$DIRNAME"
+
+ # convert to .asciidoc format using pandoc
+ pandoc $HTML_DIR/$FNAME -f html -t asciidoc -i --parse-raw --wrap=none --standalone --atx-headers --template=$PANDOC_TEMPLATE -o ${ASCII_DIR}/${FNAME%.*}.adoc
+
+ perl "$WORK_DIR/conversion-tools/post-process-adocs.pl" ${ASCII_DIR}/${FNAME%.*}.adoc
+done;
[2/2] lucene-solr:jira/solr-10290: SOLR-10290: Add ivy and build files, etc.
Posted by ct...@apache.org.
SOLR-10290: Add ivy and build files, etc.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/ec324b29
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/ec324b29
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/ec324b29
Branch: refs/heads/jira/solr-10290
Commit: ec324b294ce858733dd014399a27ccb2cb513def
Parents: 8736246
Author: Cassandra Targett <ct...@apache.org>
Authored: Thu Mar 16 11:51:06 2017 -0500
Committer: Cassandra Targett <ct...@apache.org>
Committed: Thu Mar 16 11:51:06 2017 -0500
----------------------------------------------------------------------
solr/solr-ref-guide/.gitignore | 1 +
solr/solr-ref-guide/README.adoc | 23 +++++
solr/solr-ref-guide/build.xml | 188 +++++++++++++++++++++++++++++++++++
solr/solr-ref-guide/ivy.xml | 10 ++
4 files changed, 222 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec324b29/solr/solr-ref-guide/.gitignore
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/.gitignore b/solr/solr-ref-guide/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/solr/solr-ref-guide/.gitignore
@@ -0,0 +1 @@
+build/
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec324b29/solr/solr-ref-guide/README.adoc
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/README.adoc b/solr/solr-ref-guide/README.adoc
new file mode 100644
index 0000000..dcca883
--- /dev/null
+++ b/solr/solr-ref-guide/README.adoc
@@ -0,0 +1,23 @@
+= Solr Ref Guide
+
+This is the source for the Solr Reference Guide.
+
+Raw content is stored in Asciidoc (`.adoc`) formatted files in the `src/` directory.
+
+These files are processed with AsciiDoctor in 2 different ways:
+
+* Via `Jekyll` to build a browsable HTML version of the Ref Guide
+** NOTE: This currently requires that you have already installed `ruby`, `jekyll`, and the `jekyll-asciidoc` plugin locally
+* Via `asciidoctor-ant` to build the officially released PDF version of the Ref Guide
+
+For details on building the ref guide, run `ant -p`.
+
+Key directories to be aware of:
+
+* `src` - where all human-edited `*.adoc` files related to the Guide live, as well as various configuration, theme, and template files.
+* `tools` - custom Java code for parsing metadata in our `src/*.adoc` files to produce some `_data/` files for site & pdf navigation purposes.
+* `build/content` - a copy of the `src` dir generated by ant where:
+** `*.template` files are processed to replace ant properties with their runtime values
+** some `build/content/_data` files are generated by our java tools based on header attributes from each of the `*.adoc` files
+* `build/html-site` - HTML generated version of the ref guide
+* `build/apache-solr-ref-guide-X.Y.pdf` - PDF generated version of the ref guide
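With these files in place, building the guide is driven entirely by ant; for
example (target names from build.xml below; "build-site" assumes Jekyll and
the jekyll-asciidoc plugin are installed locally, as noted above):

    cd solr/solr-ref-guide
    ant build-pdf     # PDF via asciidoctor-ant
    ant build-site    # HTML via Jekyll
    ant build         # both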
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec324b29/solr/solr-ref-guide/build.xml
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/build.xml b/solr/solr-ref-guide/build.xml
new file mode 100644
index 0000000..444db44
--- /dev/null
+++ b/solr/solr-ref-guide/build.xml
@@ -0,0 +1,188 @@
+<project xmlns:asciidoctor="antlib:org.asciidoctor.ant" xmlns:ivy="antlib:org.apache.ivy.ant" >
+
+ <!-- TODO: once we're part of the lucene-solr git repo,
+ remove these lines and replace them with an import of common-build.xml -->
+ <property name="version.base" value="6.2.42" />
+ <tstamp>
+ <format property="current.year" pattern="yyyy"/>
+ <format property="DSTAMP" pattern="yyyy-MM-dd"/>
+ </tstamp>
+ <property name="ivy.sync" value="true"/>
+ <condition property="ivy.symlink">
+ <os family="unix"/>
+ </condition>
+ <!-- <import file="....../common-build.xml"/> -->
+
+
+
+
+ <!-- properties to use in our docs -->
+ <loadresource property="solr-docs-version">
+ <propertyresource name="version.base"/>
+ <filterchain>
+ <tokenfilter>
+ <filetokenizer/>
+ <replaceregex pattern="^(\d+\.\d+)(|\..*)$" replace="\1" flags="s"/>
+ </tokenfilter>
+ </filterchain>
+ </loadresource>
+ <loadresource property="solr-docs-version-path">
+ <propertyresource name="solr-docs-version"/>
+ <filterchain>
+ <tokenfilter>
+ <filetokenizer/>
+ <replaceregex pattern="^(\d+)\.(\d+)(|\..*)$" replace="\1_\2_0" flags="s"/>
+ </tokenfilter>
+ </filterchain>
+ </loadresource>
+ <property name="solr-javadocs" value="https://lucene.apache.org/solr/${solr-docs-version-path}/" />
+ <property name="lucene-javadocs" value="https://lucene.apache.org/core/${solr-docs-version-path}/" />
+
+ <target name="resolve">
+ <mkdir dir="build/lib"/>
+ <ivy:retrieve type="jar,bundle" sync="${ivy.sync}" log="download-only" symlink="${ivy.symlink}"
+ pattern="build/lib/[artifact]-[revision].[ext]" />
+ </target>
+
+ <property name="build.content.dir" location="build/content" />
+ <property name="main-page" value="apache-solr-reference-guide" />
+ <property name="pdf-filename" value="apache-solr-ref-guide-${solr-docs-version}.pdf" />
+
+ <!-- ====== TOOLS FOR GENERATING/VALIDATING BITS OF THE SITE / PDF ======= -->
+ <property name="tools-jar-name" value="solr-ref-guide-tools.jar" />
+ <path id="tools-compile-classpath">
+ <fileset dir="build/lib">
+ <include name="**/*.jar"/>
+ <exclude name="**/${tools-jar-name}" />
+ </fileset>
+ </path>
+ <path id="tools-run-classpath">
+ <fileset dir="build/lib">
+ <include name="**/*.jar"/>
+ </fileset>
+ </path>
+
+ <target name="clean">
+ <delete dir="build"/>
+ </target>
+
+ <target name="build-tools-jar" depends="resolve" description="Builds the custom java tools use use for generating some data files from page metdata">
+ <mkdir dir="build/classes"/>
+ <javac debug="yes"
+ debuglevel="source,lines,vars"
+ destdir="build/classes"
+ includeantruntime="false">
+ <compilerarg value="-Xlint:all"/>
+ <classpath refid="tools-compile-classpath"/>
+ <src path="tools/"/>
+ </javac>
+ <jar destfile="build/lib/${tools-jar-name}">
+ <fileset dir="build/classes"
+ includes="**/*.class"/>
+ </jar>
+ </target>
+
+ <target name="build-init" description="Prepares the build/content dir, copying over src files and transforming *.template files in the process">
+ <delete dir="${build.content.dir}" />
+ <mkdir dir="${build.content.dir}" />
+ <echo>Copying all non template files from src ...</echo>
+ <copy todir="${build.content.dir}">
+ <fileset dir="src">
+ <exclude name="**/*.template"/>
+ </fileset>
+ </copy>
+ <echo>Copy (w/prop replacement) any template files from src...</echo>
+ <copy todir="${build.content.dir}">
+ <fileset dir="src">
+ <include name="**/*.template"/>
+ </fileset>
+ <mapper type="glob" from="*.template" to="*"/>
+ <filterchain>
+ <expandproperties/>
+ </filterchain>
+ </copy>
+ </target>
+
+ <target name="build-nav-data-files" depends="build-init,build-tools-jar" description="creates nav based data files needed by both the html and pdf artifacts">
+ <mkdir dir="${build.content.dir}/_data"/>
+ <java classname="BuildNavAndPDFBody"
+ failonerror="true"
+ fork="true">
+ <classpath refid="tools-run-classpath"/>
+ <arg value="${build.content.dir}"/>
+ <arg value="${main-page}"/>
+ </java>
+ </target>
+
+ <target name="check-links-and-anchors" depends="build-init,build-tools-jar" description="Parse the HTML site files to check for problematic links or anchors">
+ <java classname="CheckLinksAndAnchors"
+ failonerror="true"
+ fork="true">
+ <classpath refid="tools-run-classpath"/>
+ <arg value="${basedir}/build/html-site"/>
+ </java>
+ </target>
+
+ <!-- ====== PDF Build ======= -->
+ <target name="build-pdf"
+ depends="build-nav-data-files,resolve"
+ description="Builds a PDF">
+ <taskdef uri="antlib:org.asciidoctor.ant" resource="org/asciidoctor/ant/antlib.xml"
+ classpathref="tools-run-classpath"/>
+ <asciidoctor:convert
+ sourceDirectory="${build.content.dir}/pdf"
+ sourceDocumentName="SolrRefGuide-all.adoc"
+ baseDir="${build.content.dir}"
+ outputDirectory="build"
+ backend="pdf"
+ extensions="adoc"
+ sourceHighlighter="coderay"
+ embedAssets="true"
+ imagesDir="${build.content.dir}"
+ doctype="book"
+ safemode="unsafe">
+ <attribute key="icons" value="font" />
+ <attribute key="icon-set" value="fa" />
+ <attribute key="docinfo!" value='' />
+ <attribute key="pdf-stylesDir" value="./pdf/themes"/>
+ <attribute key="pdf-style" value="refguide"/>
+ <attribute key="pdf-fontsDir" value="./pdf/fonts"/>
+ <attribute key="pagenums" value='' />
+ <attribute key="figure-caption!" value='' />
+ <attribute key="idprefix" value='' />
+ <attribute key="idseparator" value='-' />
+ <!-- attributes used in adoc files -->
+ <!-- NOTE: If you add any attributes here for use in adoc files, you almost certainly need to also add
+ them to the _config.yml.template file for building the jekyll site as well
+ -->
+ <attribute key="solr-docs-version" value="${solr-docs-version}" />
+ <attribute key="solr-javadocs" value="${solr-javadocs}" />
+ <attribute key="lucene-javadocs" value="${lucene-javadocs}" />
+ <attribute key="build-date" value="${DSTAMP}" />
+ <attribute key="build-year" value="${current.year}" />
+ </asciidoctor:convert>
+ <move file="build/SolrRefGuide-all.pdf" tofile="build/${pdf-filename}" />
+ <echo>Finished Building ${basedir}/build/${pdf-filename}</echo>
+ </target>
+
+ <!-- ======= HTML Site Build =======
+ Builds site with Jekyll.
+ This (for now) assumes that Jekyll (http://jekyllrb.com) is installed locally. -->
+ <target name="build-site"
+ depends="build-init,build-nav-data-files"
+ description="Builds an HTML Site w/Jekyll">
+ <echo>Running Jekyll...</echo>
+ <exec executable="jekyll" dir="${build.content.dir}">
+ <arg value="build"/>
+ </exec>
+ <echo>Ready to browse site: ${basedir}/build/html-site/${main-page}.html</echo>
+ </target>
+
+ <target name="build"
+ description="Builds both a PDF and HTML versions of the ref guide"
+ depends="build-pdf,build-site">
+ <echo>PDF: ${basedir}/build/${pdf-filename}</echo>
+ <echo>SITE: ${basedir}/build/html-site/${main-page}.html</echo>
+ </target>
+
+</project>
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/ec324b29/solr/solr-ref-guide/ivy.xml
----------------------------------------------------------------------
diff --git a/solr/solr-ref-guide/ivy.xml b/solr/solr-ref-guide/ivy.xml
new file mode 100644
index 0000000..f625aed
--- /dev/null
+++ b/solr/solr-ref-guide/ivy.xml
@@ -0,0 +1,10 @@
+<ivy-module version="2.0">
+ <info organisation="org.apache.solr" module="ref-guide-tools"/>
+ <dependencies>
+ <!-- v1.5.3 of asciidoctor-ant includes asciidoctorj-pdf 1.5.0-alpha.11,
+ which is the same as asciidoctor-pdf 1.5.0-alpha.11 -->
+ <dependency org="org.asciidoctor" name="asciidoctor-ant" rev="1.5.3" />
+ <dependency org="org.json" name="json" rev="20160810" />
+ <dependency org="org.jsoup" name="jsoup" rev="1.8.2" />
+ </dependencies>
+</ivy-module>