Posted to commits@allura.apache.org by tv...@apache.org on 2013/03/05 23:42:12 UTC

[1/50] [abbrv] git commit: [#5870] Make wiki2markdown a ScriptTask

[#5870] Make wiki2markdown a ScriptTask


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/adaf6371
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/adaf6371
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/adaf6371

Branch: refs/heads/si/5453
Commit: adaf6371196f4abb1050a506537ffa14f0528e66
Parents: 5f89e92
Author: Tim Van Steenburgh <tv...@gmail.com>
Authored: Thu Feb 28 17:03:56 2013 +0000
Committer: Dave Brondsema <db...@geek.net>
Committed: Thu Feb 28 22:04:48 2013 +0000

----------------------------------------------------------------------
 ForgeWiki/forgewiki/command/__init__.py            |    1 -
 ForgeWiki/forgewiki/command/base.py                |    4 -
 .../forgewiki/command/wiki2markdown/__init__.py    |   94 -------
 .../forgewiki/command/wiki2markdown/extractors.py  |  178 --------------
 .../forgewiki/command/wiki2markdown/loaders.py     |  191 ---------------
 .../forgewiki/scripts/wiki2markdown/__init__.py    |    1 +
 .../forgewiki/scripts/wiki2markdown/extractors.py  |  171 +++++++++++++
 .../forgewiki/scripts/wiki2markdown/loaders.py     |  182 ++++++++++++++
 .../scripts/wiki2markdown/wiki2markdown.py         |  109 ++++++++
 ForgeWiki/forgewiki/tests/test_wiki2markdown.py    |    4 +-
 10 files changed, 465 insertions(+), 470 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/command/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/__init__.py b/ForgeWiki/forgewiki/command/__init__.py
deleted file mode 100644
index f311f7d..0000000
--- a/ForgeWiki/forgewiki/command/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from wiki2markdown import Wiki2MarkDownCommand

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/command/base.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/base.py b/ForgeWiki/forgewiki/command/base.py
deleted file mode 100644
index 4634e11..0000000
--- a/ForgeWiki/forgewiki/command/base.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from allura.command.base import Command
-
-class WikiCommand(Command):
-    group_name = 'ForgeWiki'

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py b/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
deleted file mode 100644
index 051ce83..0000000
--- a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from allura.command import base as allura_base
-from allura.lib import helpers as h
-
-from forgewiki.command.base import WikiCommand
-from forgewiki.command.wiki2markdown.extractors import MySQLExtractor
-from forgewiki.command.wiki2markdown.loaders import MediawikiLoader
-
-class Wiki2MarkDownCommand(WikiCommand):
-    """Import MediaWiki to Allura Wiki tool"""
-    min_args = 1
-    max_args = None
-    summary = 'Import wiki from mediawiki-dump to allura wiki'
-
-    parser = WikiCommand.standard_parser(verbose=True)
-    parser.add_option('-e', '--extract-only', action='store_true',
-                      dest='extract',
-                      help='Store data from the mediawiki-dump '
-                      'on the local filesystem; not load into Allura')
-    parser.add_option('-l', '--load-only', action='store_true', dest='load',
-                help='Load into Allura previously-extracted data')
-    parser.add_option('-d', '--dump-dir', dest='dump_dir', default='',
-                help='Directory for dump files')
-    parser.add_option('-n', '--neighborhood', dest='nbhd', default='',
-                help='Neighborhood name to load data')
-    parser.add_option('-p', '--project', dest='project', default='',
-                help='Project shortname to load data into')
-    parser.add_option('-a', '--attachments-dir', dest='attachments_dir',
-                help='Path to directory with mediawiki attachments dump',
-                default='')
-
-    parser.add_option('--db_config_prefix', dest='db_config_prefix',
-                      help='Key prefix (e.g. "legacy.") in ini file to use instead of commandline db params')
-
-    parser.add_option('-s', '--source', dest='source', default='',
-                help='Database type to extract from (only mysql for now)')
-    parser.add_option('--db_name', dest='db_name', default='mediawiki',
-                help='Database name')
-    parser.add_option('--host', dest='host', default='localhost',
-                help='Database host')
-    parser.add_option('--port', dest='port', type='int', default=0,
-                help='Database port')
-    parser.add_option('--user', dest='user', default='',
-                help='User for database connection')
-    parser.add_option('--password', dest='password', default='',
-                help='Password for database connection')
-
-
-    def command(self):
-        self.basic_setup()
-        self.handle_options()
-
-        if self.options.extract:
-            self.extractor.extract()
-        if self.options.load:
-            self.loader = MediawikiLoader(self.options)
-            self.loader.load()
-
-    def handle_options(self):
-        if not self.options.dump_dir:
-            allura_base.log.error('You must specify directory for dump files')
-            exit(2)
-
-        if not self.options.extract and not self.options.load:
-            # if action doesn't specified - do both
-            self.options.extract = True
-            self.options.load = True
-
-        if self.options.load and (not self.options.project
-                                  or not self.options.nbhd):
-            allura_base.log.error('You must specify neighborhood and project '
-                                  'to load data')
-            exit(2)
-
-        if self.options.extract:
-            if self.options.db_config_prefix:
-                for k, v in h.config_with_prefix(self.config, self.options.db_config_prefix).iteritems():
-                    if k == 'port':
-                        v = int(v)
-                    setattr(self.options, k, v)
-
-            if self.options.source == 'mysql':
-                self.extractor = MySQLExtractor(self.options)
-            elif self.options.source in ('sqlite', 'postgres', 'sql-dump'):
-                allura_base.log.error('This source not implemented yet.'
-                                      'Only mysql for now')
-                exit(2)
-            else:
-                allura_base.log.error('You must specify valid data source')
-                exit(2)
-
-            if not self.options.attachments_dir:
-                allura_base.log.error('You must specify path to directory '
-                                      'with mediawiki attachmets dump.')
-                exit(2)

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py b/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
deleted file mode 100644
index 17c1940..0000000
--- a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import os
-import shutil
-import json
-import hashlib
-
-from allura.command import base as allura_base
-
-
-class MediawikiExtractor(object):
-    """Base class for MediaWiki data provider"""
-
-    def __init__(self, options):
-        self.options = options
-        if os.path.exists(self.options.dump_dir):
-            # clear dump_dir before extraction (there may be an old data)
-            shutil.rmtree(self.options.dump_dir)
-        os.makedirs(self.options.dump_dir)
-
-    def extract(self):
-        """Extract pages with history, attachments, talk-pages, etc"""
-        raise NotImplementedError("subclass must override this")
-
-
-class MySQLExtractor(MediawikiExtractor):
-    """Extract MediaWiki data to json.
-
-    Use connection to MySQL database as a data source.
-    """
-
-    def __init__(self, options):
-        super(MySQLExtractor, self).__init__(options)
-        self._connection = None
-        self.db_options = {
-            'host': self.options.host or 'localhost',
-            'user': self.options.user,
-            'passwd': self.options.password,
-            'db': self.options.db_name,
-            'port': self.options.port or 3306
-        }
-
-    def connection(self):
-        try:
-            import MySQLdb
-        except ImportError:
-            raise ImportError('GPL library MySQL-python is required for this operation')
-
-        if not self._connection:
-            try:
-                self._connection = MySQLdb.connect(**self.db_options)
-            except MySQLdb.DatabaseError, e:
-                allura_base.log.error("Can't connect to database: %s" % str(e))
-                exit(2)
-        return self._connection
-
-    def _save(self, content, *paths):
-        """Save json to file in local filesystem"""
-        out_file = os.path.join(self.options.dump_dir, *paths)
-        if not os.path.exists(os.path.dirname(out_file)):
-            os.makedirs(os.path.dirname(out_file))
-        with open(out_file, 'w') as out:
-            out.write(content.encode('utf-8'))
-
-    def _save_attachment(self, filepath, *paths):
-        """Save attachment in dump directory.
-
-        Copy from mediawiki dump directory to our internal dump directory.
-
-        args:
-        filepath - path to attachment in mediawiki dump.
-        *paths - path to internal dump directory.
-        """
-        out_dir = os.path.join(self.options.dump_dir, *paths)
-        if not os.path.exists(out_dir):
-            os.makedirs(out_dir)
-        shutil.copy(filepath, out_dir)
-
-    def _pages(self):
-        """Yield page_data for next wiki page"""
-        c = self.connection().cursor()
-        c.execute('select page.page_id, page.page_title '
-                  'from page where page.page_namespace = 0')
-        for row in c:
-            _id, title = row
-            page_data = {
-                'page_id': _id,
-                'title': title,
-            }
-            yield page_data
-
-    def _history(self, page_id):
-        """Yield page_data for next revision of wiki page"""
-        c = self.connection().cursor()
-        c.execute('select revision.rev_timestamp, text.old_text, '
-                  'revision.rev_user_text '
-                  'from revision '
-                  'left join text on revision.rev_text_id = text.old_id '
-                  'where revision.rev_page = %s', page_id)
-        for row in c:
-            timestamp, text, username = row
-            page_data = {
-                'timestamp': timestamp,
-                'text': text or '',
-                'username': username
-            }
-            yield page_data
-
-    def _talk(self, page_title):
-        """Return page_data for talk page with `page_title` title"""
-        c = self.connection().cursor()
-        query_attrs = (page_title, 1)  # page_namespace == 1 - talk pages
-        c.execute('select text.old_text, revision.rev_timestamp, '
-                  'revision.rev_user_text '
-                  'from page '
-                  'left join revision on revision.rev_id = page.page_latest '
-                  'left join text on text.old_id = revision.rev_text_id '
-                  'where page.page_title = %s and page.page_namespace = %s '
-                  'limit 1', query_attrs)
-
-        row = c.fetchone()
-        if row:
-            text, timestamp, username = row
-            return {'text': text, 'timestamp': timestamp, 'username': username}
-
-    def _attachments(self, page_id):
-        """Yield path to next file attached to wiki page"""
-        c = self.connection().cursor()
-        c.execute('select il_to from imagelinks '
-                  'where il_from = %s' % page_id)
-        for row in c:
-            name = row[0]
-            # mediawiki stores attachmets in subdirectories
-            # based on md5-hash of filename
-            # so we need to build path to file as follows
-            md5 = hashlib.md5(name).hexdigest()
-            path = os.path.join(self.options.attachments_dir,
-                               md5[:1], md5[:2], name)
-            if os.path.isfile(path):
-                yield path
-
-    def extract(self):
-        self.extract_pages()
-
-    def extract_pages(self):
-        allura_base.log.info('Extracting pages...')
-        for page in self._pages():
-            self.extract_history(page)
-            self.extract_talk(page)
-            self.extract_attachments(page)
-        allura_base.log.info('Extracting pages done')
-
-    def extract_history(self, page):
-        page_id = page['page_id']
-        for page_data in self._history(page_id):
-            page_data.update(page)
-            self._save(json.dumps(page_data), 'pages', str(page_id),
-                       'history', str(page_data['timestamp']) + '.json')
-        allura_base.log.info('Extracted history for page %s (%s)'
-                             % (page_id, page['title']))
-
-    def extract_talk(self, page):
-        page_id = page['page_id']
-        talk_page_data = self._talk(page['title'])
-        if talk_page_data:
-            self._save(json.dumps(talk_page_data), 'pages', str(page_id),
-                       'discussion.json')
-            allura_base.log.info('Extracted talk for page %s (%s)'
-                                 % (page_id, page['title']))
-
-        allura_base.log.info('No talk for page %s (%s)'
-                             % (page_id, page['title']))
-
-    def extract_attachments(self, page):
-        page_id = page['page_id']
-        for filepath in self._attachments(page_id):
-            self._save_attachment(filepath, 'pages', str(page_id),
-                                  'attachments')
-        allura_base.log.info('Extracted attachments for page %s (%s)'
-                             % (page_id, page['title']))

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py b/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
deleted file mode 100644
index d7a3ce4..0000000
--- a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
+++ /dev/null
@@ -1,191 +0,0 @@
-import os
-import json
-import datetime
-from pylons import tmpl_context as c
-from ming.orm.ormsession import ThreadLocalORMSession
-
-from allura import model as M
-from forgewiki import model as WM
-from forgewiki.converters import mediawiki2markdown
-from forgewiki.converters import mediawiki_internal_links2markdown
-from allura.command import base as allura_base
-from allura.lib import helpers as h
-from allura.lib import utils
-from allura.model.session import artifact_orm_session
-
-
-class MediawikiLoader(object):
-    """Load MediaWiki data from json to Allura wiki tool"""
-    TIMESTAMP_FMT = '%Y%m%d%H%M%S'
-
-    def __init__(self, options):
-        self.options = options
-        self.nbhd = M.Neighborhood.query.get(name=options.nbhd)
-        if not self.nbhd:
-            allura_base.log.error("Can't find neighborhood with name %s"
-                                  % options.nbhd)
-            exit(2)
-        self.project = M.Project.query.get(shortname=options.project,
-                                           neighborhood_id=self.nbhd._id)
-        if not self.project:
-            allura_base.log.error("Can't find project with shortname %s "
-                                  "and neighborhood_id %s"
-                                  % (options.project, self.nbhd._id))
-            exit(2)
-
-        self.wiki = self.project.app_instance('wiki')
-        if not self.wiki:
-            allura_base.log.error("Can't find wiki app in given project")
-            exit(2)
-
-        h.set_context(self.project.shortname, 'wiki', neighborhood=self.nbhd)
-        self.project.notifications_disabled = True
-
-    def exit(self, status):
-        self.project.notifications_disabled = False
-        ThreadLocalORMSession.flush_all()
-        ThreadLocalORMSession.close_all()
-        exit(status)
-
-    def load(self):
-        artifact_orm_session._get().skip_mod_date = True
-        self.load_pages()
-        self.project.notifications_disabled = False
-        artifact_orm_session._get().skip_mod_date = False
-        ThreadLocalORMSession.flush_all()
-        ThreadLocalORMSession.close_all()
-        allura_base.log.info('Loading wiki done')
-
-    def _pages(self):
-        """Yield path to page dump directory for next wiki page"""
-        pages_dir = os.path.join(self.options.dump_dir, 'pages')
-        pages = []
-        if not os.path.isdir(pages_dir):
-            return
-        pages = os.listdir(pages_dir)
-        for directory in pages:
-            dir_path = os.path.join(pages_dir, directory)
-            if os.path.isdir(dir_path):
-                yield dir_path
-
-    def _history(self, page_dir):
-        """Yield page_data for next wiki page in edit history"""
-        page_dir = os.path.join(page_dir, 'history')
-        if not os.path.isdir(page_dir):
-            return
-        pages = os.listdir(page_dir)
-        pages.sort()  # ensure that history in right order
-        for page in pages:
-            fn = os.path.join(page_dir, page)
-            try:
-                with open(fn, 'r') as pages_file:
-                    page_data = json.load(pages_file)
-            except IOError, e:
-                allura_base.log.error("Can't open file: %s" % str(e))
-                self.exit(2)
-            except ValueError, e:
-                allura_base.log.error("Can't load data from file %s: %s"
-                                      % (fn, str(e)))
-                self.exit(2)
-            yield page_data
-
-    def _talk(self, page_dir):
-        """Return talk data from json dump"""
-        filename = os.path.join(page_dir, 'discussion.json')
-        if not os.path.isfile(filename):
-            return
-        try:
-            with open(filename, 'r') as talk_file:
-                talk_data = json.load(talk_file)
-        except IOError, e:
-            allura_base.log.error("Can't open file: %s" % str(e))
-            self.exit(2)
-        except ValueError, e:
-            allura_base.log.error("Can't load data from file %s: %s"
-                                  % (filename, str(e)))
-            self.exit(2)
-        return talk_data
-
-    def _attachments(self, page_dir):
-        """Yield (filename, full path) to next attachment for given page."""
-        attachments_dir = os.path.join(page_dir, 'attachments')
-        if not os.path.isdir(attachments_dir):
-            return
-        attachments = os.listdir(attachments_dir)
-        for filename in attachments:
-            yield filename, os.path.join(attachments_dir, filename)
-
-    def load_pages(self):
-        """Load pages with edit history from json to Allura wiki tool"""
-        allura_base.log.info('Loading pages into allura...')
-        for page_dir in self._pages():
-            for page in self._history(page_dir):
-                p = WM.Page.upsert(page['title'])
-                p.viewable_by = ['all']
-                p.text = mediawiki_internal_links2markdown(
-                            mediawiki2markdown(page['text']),
-                            page['title'])
-                timestamp = datetime.datetime.strptime(page['timestamp'],
-                                                        self.TIMESTAMP_FMT)
-                p.mod_date = timestamp
-                c.user = (M.User.query.get(username=page['username'].lower())
-                          or M.User.anonymous())
-                ss = p.commit()
-                ss.mod_date = ss.timestamp = timestamp
-
-            # set home to main page
-            if page['title'] == 'Main_Page':
-                gl = WM.Globals.query.get(app_config_id=self.wiki.config._id)
-                if gl is not None:
-                    gl.root = page['title']
-            allura_base.log.info('Loaded history of page %s (%s)'
-                                 % (page['page_id'], page['title']))
-
-            self.load_talk(page_dir, page['title'])
-            self.load_attachments(page_dir, page['title'])
-
-    def load_talk(self, page_dir, page_title):
-        """Load talk for page.
-
-        page_dir - path to directory with page dump.
-        page_title - page title in Allura Wiki
-        """
-        talk_data = self._talk(page_dir)
-        if not talk_data:
-            return
-        text = mediawiki2markdown(talk_data['text'])
-        page = WM.Page.query.get(app_config_id=self.wiki.config._id,
-                                 title=page_title)
-        if not page:
-            return
-        thread = M.Thread.query.get(ref_id=page.index_id())
-        if not thread:
-            return
-        timestamp = datetime.datetime.strptime(talk_data['timestamp'],
-                                               self.TIMESTAMP_FMT)
-        c.user = (M.User.query.get(username=talk_data['username'].lower())
-                  or M.User.anonymous())
-        thread.add_post(
-            text=text,
-            discussion_id=thread.discussion_id,
-            thread_id=thread._id,
-            timestamp=timestamp,
-            ignore_security=True)
-        allura_base.log.info('Loaded talk for page %s' % page_title)
-
-    def load_attachments(self, page_dir, page_title):
-        """Load attachments for page.
-
-        page_dir - path to directory with page dump.
-        """
-        page = WM.Page.query.get(app_config_id=self.wiki.config._id,
-                                 title=page_title)
-        for filename, path in self._attachments(page_dir):
-            try:
-                with open(path) as fp:
-                    page.attach(filename, fp,
-                                content_type=utils.guess_mime_type(filename))
-            except IOError, e:
-                allura_base.log.error("Can't open file: %s" % str(e))
-                self.exit(2)
-        allura_base.log.info('Loaded attachments for page %s.' % page_title)

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/scripts/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/__init__.py b/ForgeWiki/forgewiki/scripts/__init__.py
new file mode 100644
index 0000000..e69de29

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/scripts/wiki2markdown/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/wiki2markdown/__init__.py b/ForgeWiki/forgewiki/scripts/wiki2markdown/__init__.py
new file mode 100644
index 0000000..2fe4b24
--- /dev/null
+++ b/ForgeWiki/forgewiki/scripts/wiki2markdown/__init__.py
@@ -0,0 +1 @@
+from wiki2markdown import Wiki2Markdown

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/scripts/wiki2markdown/extractors.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/wiki2markdown/extractors.py b/ForgeWiki/forgewiki/scripts/wiki2markdown/extractors.py
new file mode 100644
index 0000000..8d95317
--- /dev/null
+++ b/ForgeWiki/forgewiki/scripts/wiki2markdown/extractors.py
@@ -0,0 +1,171 @@
+import logging
+import os
+import shutil
+import json
+import hashlib
+
+log = logging.getLogger(__name__)
+
+
+class MediawikiExtractor(object):
+    """Base class for MediaWiki data provider"""
+
+    def __init__(self, options):
+        self.options = options
+        if os.path.exists(self.options.dump_dir):
+            # clear dump_dir before extraction (there may be old data)
+            shutil.rmtree(self.options.dump_dir)
+        os.makedirs(self.options.dump_dir)
+
+    def extract(self):
+        """Extract pages with history, attachments, talk-pages, etc"""
+        raise NotImplementedError("subclass must override this")
+
+
+class MySQLExtractor(MediawikiExtractor):
+    """Extract MediaWiki data to json.
+
+    Use connection to MySQL database as a data source.
+    """
+
+    def __init__(self, options):
+        super(MySQLExtractor, self).__init__(options)
+        self._connection = None
+        self.db_options = {
+            'host': self.options.host or 'localhost',
+            'user': self.options.user,
+            'passwd': self.options.password,
+            'db': self.options.db_name,
+            'port': self.options.port or 3306
+        }
+
+    def connection(self):
+        try:
+            import MySQLdb
+        except ImportError:
+            raise ImportError('GPL library MySQL-python is required for this operation')
+
+        if not self._connection:
+            self._connection = MySQLdb.connect(**self.db_options)
+        return self._connection
+
+    def _save(self, content, *paths):
+        """Save json to file in local filesystem"""
+        out_file = os.path.join(self.options.dump_dir, *paths)
+        if not os.path.exists(os.path.dirname(out_file)):
+            os.makedirs(os.path.dirname(out_file))
+        with open(out_file, 'w') as out:
+            out.write(content.encode('utf-8'))
+
+    def _save_attachment(self, filepath, *paths):
+        """Save attachment in dump directory.
+
+        Copy from mediawiki dump directory to our internal dump directory.
+
+        args:
+        filepath - path to attachment in mediawiki dump.
+        *paths - path to internal dump directory.
+        """
+        out_dir = os.path.join(self.options.dump_dir, *paths)
+        if not os.path.exists(out_dir):
+            os.makedirs(out_dir)
+        shutil.copy(filepath, out_dir)
+
+    def _pages(self):
+        """Yield page_data for next wiki page"""
+        c = self.connection().cursor()
+        c.execute('select page.page_id, page.page_title '
+                  'from page where page.page_namespace = 0')
+        for row in c:
+            _id, title = row
+            page_data = {
+                'page_id': _id,
+                'title': title,
+            }
+            yield page_data
+
+    def _history(self, page_id):
+        """Yield page_data for next revision of wiki page"""
+        c = self.connection().cursor()
+        c.execute('select revision.rev_timestamp, text.old_text, '
+                  'revision.rev_user_text '
+                  'from revision '
+                  'left join text on revision.rev_text_id = text.old_id '
+                  'where revision.rev_page = %s', page_id)
+        for row in c:
+            timestamp, text, username = row
+            page_data = {
+                'timestamp': timestamp,
+                'text': text or '',
+                'username': username
+            }
+            yield page_data
+
+    def _talk(self, page_title):
+        """Return page_data for talk page with `page_title` title"""
+        c = self.connection().cursor()
+        query_attrs = (page_title, 1)  # page_namespace == 1 - talk pages
+        c.execute('select text.old_text, revision.rev_timestamp, '
+                  'revision.rev_user_text '
+                  'from page '
+                  'left join revision on revision.rev_id = page.page_latest '
+                  'left join text on text.old_id = revision.rev_text_id '
+                  'where page.page_title = %s and page.page_namespace = %s '
+                  'limit 1', query_attrs)
+
+        row = c.fetchone()
+        if row:
+            text, timestamp, username = row
+            return {'text': text, 'timestamp': timestamp, 'username': username}
+
+    def _attachments(self, page_id):
+        """Yield path to next file attached to wiki page"""
+        c = self.connection().cursor()
+        c.execute('select il_to from imagelinks '
+                  'where il_from = %s' % page_id)
+        for row in c:
+            name = row[0]
+            # mediawiki stores attachments in subdirectories
+            # based on the md5 hash of the filename,
+            # so we need to build the path to the file as follows
+            md5 = hashlib.md5(name).hexdigest()
+            path = os.path.join(self.options.attachments_dir,
+                               md5[:1], md5[:2], name)
+            if os.path.isfile(path):
+                yield path
+
+    def extract(self):
+        self.extract_pages()
+
+    def extract_pages(self):
+        log.info('Extracting pages...')
+        for page in self._pages():
+            self.extract_history(page)
+            self.extract_talk(page)
+            self.extract_attachments(page)
+        log.info('Extracting pages done')
+
+    def extract_history(self, page):
+        page_id = page['page_id']
+        for page_data in self._history(page_id):
+            page_data.update(page)
+            self._save(json.dumps(page_data), 'pages', str(page_id),
+                       'history', str(page_data['timestamp']) + '.json')
+        log.info('Extracted history for page %s (%s)', page_id, page['title'])
+
+    def extract_talk(self, page):
+        page_id = page['page_id']
+        talk_page_data = self._talk(page['title'])
+        if talk_page_data:
+            self._save(json.dumps(talk_page_data), 'pages', str(page_id),
+                       'discussion.json')
+            log.info('Extracted talk for page %s (%s)', page_id, page['title'])
+        else:
+            log.info('No talk for page %s (%s)', page_id, page['title'])
+
+    def extract_attachments(self, page):
+        page_id = page['page_id']
+        for filepath in self._attachments(page_id):
+            self._save_attachment(filepath, 'pages', str(page_id),
+                                  'attachments')
+        log.info('Extracted attachments for page %s (%s)', page_id, page['title'])
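
A minimal sketch (not part of this commit) of driving the extractor on its own, e.g. for an extract-only run; the option names mirror the attributes the classes above read, and the paths and credentials are placeholders:

    from argparse import Namespace
    from forgewiki.scripts.wiki2markdown.extractors import MySQLExtractor

    # Placeholder options; dump_dir is cleared and re-created by
    # MediawikiExtractor.__init__, and attachments_dir should point at the
    # MediaWiki images directory (files are located via md5-hash subdirs).
    options = Namespace(
        dump_dir='/tmp/mediawiki-dump',
        host='localhost',
        port=3306,
        user='wiki',
        password='secret',
        db_name='mediawiki',
        attachments_dir='/var/lib/mediawiki/images',
    )
    MySQLExtractor(options).extract()
    # writes pages/<page_id>/history/<timestamp>.json, discussion.json
    # and attachments/ under dump_dir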

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/scripts/wiki2markdown/loaders.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/wiki2markdown/loaders.py b/ForgeWiki/forgewiki/scripts/wiki2markdown/loaders.py
new file mode 100644
index 0000000..00487a1
--- /dev/null
+++ b/ForgeWiki/forgewiki/scripts/wiki2markdown/loaders.py
@@ -0,0 +1,182 @@
+import logging
+import os
+import json
+import datetime
+from pylons import tmpl_context as c
+from ming.orm.ormsession import ThreadLocalORMSession
+
+from allura import model as M
+from forgewiki import model as WM
+from forgewiki.converters import mediawiki2markdown
+from forgewiki.converters import mediawiki_internal_links2markdown
+from allura.lib import helpers as h
+from allura.lib import utils
+from allura.model.session import artifact_orm_session
+
+log = logging.getLogger(__name__)
+
+
+class MediawikiLoader(object):
+    """Load MediaWiki data from json to Allura wiki tool"""
+    TIMESTAMP_FMT = '%Y%m%d%H%M%S'
+
+    def __init__(self, options):
+        self.options = options
+        self.nbhd = M.Neighborhood.query.get(name=options.nbhd)
+        if not self.nbhd:
+            raise ValueError("Can't find neighborhood with name %s"
+                                  % options.nbhd)
+        self.project = M.Project.query.get(shortname=options.project,
+                                           neighborhood_id=self.nbhd._id)
+        if not self.project:
+            raise ValueError("Can't find project with shortname %s "
+                                  "and neighborhood_id %s"
+                                  % (options.project, self.nbhd._id))
+
+        self.wiki = self.project.app_instance('wiki')
+        if not self.wiki:
+            raise ValueError("Can't find wiki app in given project")
+
+        h.set_context(self.project.shortname, 'wiki', neighborhood=self.nbhd)
+
+    def load(self):
+        try:
+            self.project.notifications_disabled = True
+            artifact_orm_session._get().skip_mod_date = True
+            self.load_pages()
+            ThreadLocalORMSession.flush_all()
+            log.info('Loading wiki done')
+        finally:
+            self.project.notifications_disabled = False
+            artifact_orm_session._get().skip_mod_date = False
+
+    def _pages(self):
+        """Yield path to page dump directory for next wiki page"""
+        pages_dir = os.path.join(self.options.dump_dir, 'pages')
+        pages = []
+        if not os.path.isdir(pages_dir):
+            return
+        pages = os.listdir(pages_dir)
+        for directory in pages:
+            dir_path = os.path.join(pages_dir, directory)
+            if os.path.isdir(dir_path):
+                yield dir_path
+
+    def _history(self, page_dir):
+        """Yield page_data for next wiki page in edit history"""
+        page_dir = os.path.join(page_dir, 'history')
+        if not os.path.isdir(page_dir):
+            return
+        pages = os.listdir(page_dir)
+        pages.sort()  # ensure that history is in the right order
+        for page in pages:
+            fn = os.path.join(page_dir, page)
+            try:
+                with open(fn, 'r') as pages_file:
+                    page_data = json.load(pages_file)
+            except IOError, e:
+                log.error("Can't open file: %s", str(e))
+                raise
+            except ValueError, e:
+                log.error("Can't load data from file %s: %s", fn, str(e))
+                raise
+            yield page_data
+
+    def _talk(self, page_dir):
+        """Return talk data from json dump"""
+        filename = os.path.join(page_dir, 'discussion.json')
+        if not os.path.isfile(filename):
+            return
+        try:
+            with open(filename, 'r') as talk_file:
+                talk_data = json.load(talk_file)
+        except IOError, e:
+            log.error("Can't open file: %s", str(e))
+            raise
+        except ValueError, e:
+            log.error("Can't load data from file %s: %s", filename, str(e))
+            raise
+        return talk_data
+
+    def _attachments(self, page_dir):
+        """Yield (filename, full path) to next attachment for given page."""
+        attachments_dir = os.path.join(page_dir, 'attachments')
+        if not os.path.isdir(attachments_dir):
+            return
+        attachments = os.listdir(attachments_dir)
+        for filename in attachments:
+            yield filename, os.path.join(attachments_dir, filename)
+
+    def load_pages(self):
+        """Load pages with edit history from json to Allura wiki tool"""
+        log.info('Loading pages into allura...')
+        for page_dir in self._pages():
+            for page in self._history(page_dir):
+                p = WM.Page.upsert(page['title'])
+                p.viewable_by = ['all']
+                p.text = mediawiki_internal_links2markdown(
+                            mediawiki2markdown(page['text']),
+                            page['title'])
+                timestamp = datetime.datetime.strptime(page['timestamp'],
+                                                        self.TIMESTAMP_FMT)
+                p.mod_date = timestamp
+                c.user = (M.User.query.get(username=page['username'].lower())
+                          or M.User.anonymous())
+                ss = p.commit()
+                ss.mod_date = ss.timestamp = timestamp
+
+            # set home to main page
+            if page['title'] == 'Main_Page':
+                gl = WM.Globals.query.get(app_config_id=self.wiki.config._id)
+                if gl is not None:
+                    gl.root = page['title']
+            log.info('Loaded history of page %s (%s)', page['page_id'], page['title'])
+
+            self.load_talk(page_dir, page['title'])
+            self.load_attachments(page_dir, page['title'])
+
+    def load_talk(self, page_dir, page_title):
+        """Load talk for page.
+
+        page_dir - path to directory with page dump.
+        page_title - page title in Allura Wiki
+        """
+        talk_data = self._talk(page_dir)
+        if not talk_data:
+            return
+        text = mediawiki2markdown(talk_data['text'])
+        page = WM.Page.query.get(app_config_id=self.wiki.config._id,
+                                 title=page_title)
+        if not page:
+            return
+        thread = M.Thread.query.get(ref_id=page.index_id())
+        if not thread:
+            return
+        timestamp = datetime.datetime.strptime(talk_data['timestamp'],
+                                               self.TIMESTAMP_FMT)
+        c.user = (M.User.query.get(username=talk_data['username'].lower())
+                  or M.User.anonymous())
+        thread.add_post(
+            text=text,
+            discussion_id=thread.discussion_id,
+            thread_id=thread._id,
+            timestamp=timestamp,
+            ignore_security=True)
+        log.info('Loaded talk for page %s', page_title)
+
+    def load_attachments(self, page_dir, page_title):
+        """Load attachments for page.
+
+        page_dir - path to directory with page dump.
+        """
+        page = WM.Page.query.get(app_config_id=self.wiki.config._id,
+                                 title=page_title)
+        for filename, path in self._attachments(page_dir):
+            try:
+                with open(path) as fp:
+                    page.attach(filename, fp,
+                                content_type=utils.guess_mime_type(filename))
+            except IOError, e:
+                log.error("Can't open file: %s", str(e))
+                raise
+        log.info('Loaded attachments for page %s.', page_title)
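
Likewise, a hypothetical sketch of running the loader against a previously extracted dump; it assumes an Allura environment is already loaded so the neighborhood/project/wiki lookups in __init__ can succeed, and the names below are placeholders:

    from argparse import Namespace
    from forgewiki.scripts.wiki2markdown.loaders import MediawikiLoader

    # Placeholder options; dump_dir must contain the pages/ tree written
    # by MySQLExtractor, and nbhd/project identify the target wiki.
    options = Namespace(
        dump_dir='/tmp/mediawiki-dump',
        nbhd='Projects',
        project='test',
    )
    MediawikiLoader(options).load()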

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py b/ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py
new file mode 100644
index 0000000..e837f45f
--- /dev/null
+++ b/ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py
@@ -0,0 +1,109 @@
+import argparse
+import logging
+import shutil
+import tempfile
+
+from tg import config
+
+from allura.lib import helpers as h
+from allura.scripts import ScriptTask
+
+from forgewiki.scripts.wiki2markdown.extractors import MySQLExtractor
+from forgewiki.scripts.wiki2markdown.loaders import MediawikiLoader
+
+log = logging.getLogger(__name__)
+
+
+class Wiki2Markdown(ScriptTask):
+    """Import MediaWiki to Allura Wiki tool"""
+    @classmethod
+    def parser(cls):
+        parser = argparse.ArgumentParser(description='Import wiki from '
+            'mediawiki-dump to allura wiki')
+        parser.add_argument('-e', '--extract-only', action='store_true',
+                          dest='extract',
+                          help='Store data from the mediawiki-dump '
+                          'on the local filesystem; not load into Allura')
+        parser.add_argument('-l', '--load-only', action='store_true', dest='load',
+                    help='Load into Allura previously-extracted data')
+        parser.add_argument('-d', '--dump-dir', dest='dump_dir', default='',
+                    help='Directory for dump files')
+        parser.add_argument('-n', '--neighborhood', dest='nbhd', default='',
+                    help='Neighborhood name to load data')
+        parser.add_argument('-p', '--project', dest='project', default='',
+                    help='Project shortname to load data into')
+        parser.add_argument('-a', '--attachments-dir', dest='attachments_dir',
+                    help='Path to directory with mediawiki attachments dump',
+                    default='')
+        parser.add_argument('--db_config_prefix', dest='db_config_prefix',
+                          help='Key prefix (e.g. "legacy.") in ini file to '
+                          'use instead of commandline db params')
+        parser.add_argument('-s', '--source', dest='source', default='mysql',
+                    help='Database type to extract from (only mysql for now)')
+        parser.add_argument('--db_name', dest='db_name', default='mediawiki',
+                    help='Database name')
+        parser.add_argument('--host', dest='host', default='localhost',
+                    help='Database host')
+        parser.add_argument('--port', dest='port', type=int, default=0,
+                    help='Database port')
+        parser.add_argument('--user', dest='user', default='',
+                    help='User for database connection')
+        parser.add_argument('--password', dest='password', default='',
+                    help='Password for database connection')
+        parser.add_argument('--keep-dumps', action='store_true', dest='keep_dumps',
+                    help='Leave dump files on disk after run')
+        return parser
+
+    @classmethod
+    def execute(cls, options):
+        options = cls.handle_options(options)
+
+        try:
+            if options.extract:
+                MySQLExtractor(options).extract()
+            if options.load:
+                MediawikiLoader(options).load()
+        finally:
+            if not options.keep_dumps:
+                shutil.rmtree(options.dump_dir)
+
+    @classmethod
+    def handle_options(cls, options):
+        if not options.extract and not options.load:
+            # if no action specified, do both
+            options.extract = True
+            options.load = True
+
+        if not options.dump_dir:
+            if options.load and not options.extract:
+                raise ValueError('You must specify directory containing dump files')
+            else:
+                options.dump_dir = tempfile.mkdtemp()
+                log.info("Writing temp files to %s", options.dump_dir)
+
+        if options.load and (not options.project or not options.nbhd):
+            raise ValueError('You must specify neighborhood and project '
+                                  'to load data')
+
+        if options.extract:
+            if options.db_config_prefix:
+                for k, v in h.config_with_prefix(config, options.db_config_prefix).iteritems():
+                    if k == 'port':
+                        v = int(v)
+                    setattr(options, k, v)
+
+            if options.source == 'mysql':
+                pass
+            elif options.source in ('sqlite', 'postgres', 'sql-dump'):
+                raise ValueError('This source is not implemented yet. Only mysql for now')
+            else:
+                raise ValueError('You must specify a valid data source')
+
+            if not options.attachments_dir:
+                raise ValueError('You must specify path to directory with mediawiki attachments dump.')
+
+        return options
+
+
+if __name__ == '__main__':
+    Wiki2Markdown.main()
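
And a rough end-to-end sketch (again hypothetical) of the new ScriptTask itself, assuming a configured Allura environment; ScriptTask.main(), used in the __main__ block above, presumably handles argument parsing and setup when the script is run from the command line:

    from forgewiki.scripts.wiki2markdown.wiki2markdown import Wiki2Markdown

    # Placeholder arguments; with neither -e nor -l given, handle_options()
    # enables both extract and load, and dump_dir is removed afterwards
    # unless --keep-dumps is passed.
    options = Wiki2Markdown.parser().parse_args([
        '--dump-dir', '/tmp/mediawiki-dump',
        '--neighborhood', 'Projects',
        '--project', 'test',
        '--attachments-dir', '/var/lib/mediawiki/images',
        '--db_name', 'mediawiki',
        '--user', 'wiki',
        '--password', 'secret',
    ])
    Wiki2Markdown.execute(options)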

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/adaf6371/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/tests/test_wiki2markdown.py b/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
index 7aa9a03..f647bde 100644
--- a/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
+++ b/ForgeWiki/forgewiki/tests/test_wiki2markdown.py
@@ -3,8 +3,8 @@ import json
 from datetime import datetime
 from IPython.testing.decorators import module_not_available, skipif
 
-from forgewiki.command.wiki2markdown.extractors import MySQLExtractor
-from forgewiki.command.wiki2markdown.loaders import MediawikiLoader
+from forgewiki.scripts.wiki2markdown.extractors import MySQLExtractor
+from forgewiki.scripts.wiki2markdown.loaders import MediawikiLoader
 from alluratest.controller import setup_basic_test
 from allura import model as M
 from forgewiki import model as WM