Posted to commits@allura.apache.org by tv...@apache.org on 2013/02/28 18:04:14 UTC
git commit: [#5870] Make wiki2markdown a ScriptTask
Updated Branches:
refs/heads/tv/5870 [created] 4d2f710aa
[#5870] Make wiki2markdown a ScriptTask
Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/4d2f710a
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/4d2f710a
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/4d2f710a
Branch: refs/heads/tv/5870
Commit: 4d2f710aa4ffb5de183066075feedaecc2a9090e
Parents: c847237
Author: Tim Van Steenburgh <tv...@gmail.com>
Authored: Thu Feb 28 17:03:56 2013 +0000
Committer: Tim Van Steenburgh <tv...@gmail.com>
Committed: Thu Feb 28 17:03:56 2013 +0000
----------------------------------------------------------------------
ForgeWiki/forgewiki/command/__init__.py | 1 -
ForgeWiki/forgewiki/command/base.py | 4 -
.../forgewiki/command/wiki2markdown/__init__.py | 94 -------
.../forgewiki/command/wiki2markdown/extractors.py | 178 --------------
.../forgewiki/command/wiki2markdown/loaders.py | 191 ---------------
.../forgewiki/scripts/wiki2markdown/__init__.py | 1 +
.../forgewiki/scripts/wiki2markdown/extractors.py | 171 +++++++++++++
.../forgewiki/scripts/wiki2markdown/loaders.py | 182 ++++++++++++++
.../scripts/wiki2markdown/wiki2markdown.py | 109 ++++++++
9 files changed, 463 insertions(+), 468 deletions(-)
----------------------------------------------------------------------
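For context, this commit retires the old Command under ForgeWiki/forgewiki/command/ and re-homes the importer as an Allura ScriptTask under ForgeWiki/forgewiki/scripts/. The ScriptTask shape, as used by the new wiki2markdown.py at the end of this diff, is an argparse-based parser() classmethod plus an execute() classmethod; a minimal sketch of that pattern (hypothetical task name, not part of this commit) could look like:

    import argparse
    from allura.scripts import ScriptTask

    class HelloTask(ScriptTask):
        """Minimal ScriptTask sketch: declare arguments, then do the work."""

        @classmethod
        def parser(cls):
            parser = argparse.ArgumentParser(description='Say hello')
            parser.add_argument('-n', '--name', dest='name', default='world')
            return parser

        @classmethod
        def execute(cls, options):
            print('hello, %s' % options.name)

    if __name__ == '__main__':
        HelloTask.main()  # main() comes from the ScriptTask base class

main() is not defined in the new module below, so it is presumably provided by the ScriptTask base class to parse the command line and call execute().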
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/command/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/__init__.py b/ForgeWiki/forgewiki/command/__init__.py
deleted file mode 100644
index f311f7d..0000000
--- a/ForgeWiki/forgewiki/command/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from wiki2markdown import Wiki2MarkDownCommand
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/command/base.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/base.py b/ForgeWiki/forgewiki/command/base.py
deleted file mode 100644
index 4634e11..0000000
--- a/ForgeWiki/forgewiki/command/base.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from allura.command.base import Command
-
-class WikiCommand(Command):
- group_name = 'ForgeWiki'
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py b/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
deleted file mode 100644
index 051ce83..0000000
--- a/ForgeWiki/forgewiki/command/wiki2markdown/__init__.py
+++ /dev/null
@@ -1,94 +0,0 @@
-from allura.command import base as allura_base
-from allura.lib import helpers as h
-
-from forgewiki.command.base import WikiCommand
-from forgewiki.command.wiki2markdown.extractors import MySQLExtractor
-from forgewiki.command.wiki2markdown.loaders import MediawikiLoader
-
-class Wiki2MarkDownCommand(WikiCommand):
- """Import MediaWiki to Allura Wiki tool"""
- min_args = 1
- max_args = None
- summary = 'Import wiki from mediawiki-dump to allura wiki'
-
- parser = WikiCommand.standard_parser(verbose=True)
- parser.add_option('-e', '--extract-only', action='store_true',
- dest='extract',
- help='Store data from the mediawiki-dump '
- 'on the local filesystem; not load into Allura')
- parser.add_option('-l', '--load-only', action='store_true', dest='load',
- help='Load into Allura previously-extracted data')
- parser.add_option('-d', '--dump-dir', dest='dump_dir', default='',
- help='Directory for dump files')
- parser.add_option('-n', '--neighborhood', dest='nbhd', default='',
- help='Neighborhood name to load data')
- parser.add_option('-p', '--project', dest='project', default='',
- help='Project shortname to load data into')
- parser.add_option('-a', '--attachments-dir', dest='attachments_dir',
- help='Path to directory with mediawiki attachments dump',
- default='')
-
- parser.add_option('--db_config_prefix', dest='db_config_prefix',
- help='Key prefix (e.g. "legacy.") in ini file to use instead of commandline db params')
-
- parser.add_option('-s', '--source', dest='source', default='',
- help='Database type to extract from (only mysql for now)')
- parser.add_option('--db_name', dest='db_name', default='mediawiki',
- help='Database name')
- parser.add_option('--host', dest='host', default='localhost',
- help='Database host')
- parser.add_option('--port', dest='port', type='int', default=0,
- help='Database port')
- parser.add_option('--user', dest='user', default='',
- help='User for database connection')
- parser.add_option('--password', dest='password', default='',
- help='Password for database connection')
-
-
- def command(self):
- self.basic_setup()
- self.handle_options()
-
- if self.options.extract:
- self.extractor.extract()
- if self.options.load:
- self.loader = MediawikiLoader(self.options)
- self.loader.load()
-
- def handle_options(self):
- if not self.options.dump_dir:
- allura_base.log.error('You must specify directory for dump files')
- exit(2)
-
- if not self.options.extract and not self.options.load:
- # if action doesn't specified - do both
- self.options.extract = True
- self.options.load = True
-
- if self.options.load and (not self.options.project
- or not self.options.nbhd):
- allura_base.log.error('You must specify neighborhood and project '
- 'to load data')
- exit(2)
-
- if self.options.extract:
- if self.options.db_config_prefix:
- for k, v in h.config_with_prefix(self.config, self.options.db_config_prefix).iteritems():
- if k == 'port':
- v = int(v)
- setattr(self.options, k, v)
-
- if self.options.source == 'mysql':
- self.extractor = MySQLExtractor(self.options)
- elif self.options.source in ('sqlite', 'postgres', 'sql-dump'):
- allura_base.log.error('This source not implemented yet.'
- 'Only mysql for now')
- exit(2)
- else:
- allura_base.log.error('You must specify valid data source')
- exit(2)
-
- if not self.options.attachments_dir:
- allura_base.log.error('You must specify path to directory '
- 'with mediawiki attachmets dump.')
- exit(2)
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py b/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
deleted file mode 100644
index 17c1940..0000000
--- a/ForgeWiki/forgewiki/command/wiki2markdown/extractors.py
+++ /dev/null
@@ -1,178 +0,0 @@
-import os
-import shutil
-import json
-import hashlib
-
-from allura.command import base as allura_base
-
-
-class MediawikiExtractor(object):
- """Base class for MediaWiki data provider"""
-
- def __init__(self, options):
- self.options = options
- if os.path.exists(self.options.dump_dir):
- # clear dump_dir before extraction (there may be an old data)
- shutil.rmtree(self.options.dump_dir)
- os.makedirs(self.options.dump_dir)
-
- def extract(self):
- """Extract pages with history, attachments, talk-pages, etc"""
- raise NotImplementedError("subclass must override this")
-
-
-class MySQLExtractor(MediawikiExtractor):
- """Extract MediaWiki data to json.
-
- Use connection to MySQL database as a data source.
- """
-
- def __init__(self, options):
- super(MySQLExtractor, self).__init__(options)
- self._connection = None
- self.db_options = {
- 'host': self.options.host or 'localhost',
- 'user': self.options.user,
- 'passwd': self.options.password,
- 'db': self.options.db_name,
- 'port': self.options.port or 3306
- }
-
- def connection(self):
- try:
- import MySQLdb
- except ImportError:
- raise ImportError('GPL library MySQL-python is required for this operation')
-
- if not self._connection:
- try:
- self._connection = MySQLdb.connect(**self.db_options)
- except MySQLdb.DatabaseError, e:
- allura_base.log.error("Can't connect to database: %s" % str(e))
- exit(2)
- return self._connection
-
- def _save(self, content, *paths):
- """Save json to file in local filesystem"""
- out_file = os.path.join(self.options.dump_dir, *paths)
- if not os.path.exists(os.path.dirname(out_file)):
- os.makedirs(os.path.dirname(out_file))
- with open(out_file, 'w') as out:
- out.write(content.encode('utf-8'))
-
- def _save_attachment(self, filepath, *paths):
- """Save attachment in dump directory.
-
- Copy from mediawiki dump directory to our internal dump directory.
-
- args:
- filepath - path to attachment in mediawiki dump.
- *paths - path to internal dump directory.
- """
- out_dir = os.path.join(self.options.dump_dir, *paths)
- if not os.path.exists(out_dir):
- os.makedirs(out_dir)
- shutil.copy(filepath, out_dir)
-
- def _pages(self):
- """Yield page_data for next wiki page"""
- c = self.connection().cursor()
- c.execute('select page.page_id, page.page_title '
- 'from page where page.page_namespace = 0')
- for row in c:
- _id, title = row
- page_data = {
- 'page_id': _id,
- 'title': title,
- }
- yield page_data
-
- def _history(self, page_id):
- """Yield page_data for next revision of wiki page"""
- c = self.connection().cursor()
- c.execute('select revision.rev_timestamp, text.old_text, '
- 'revision.rev_user_text '
- 'from revision '
- 'left join text on revision.rev_text_id = text.old_id '
- 'where revision.rev_page = %s', page_id)
- for row in c:
- timestamp, text, username = row
- page_data = {
- 'timestamp': timestamp,
- 'text': text or '',
- 'username': username
- }
- yield page_data
-
- def _talk(self, page_title):
- """Return page_data for talk page with `page_title` title"""
- c = self.connection().cursor()
- query_attrs = (page_title, 1) # page_namespace == 1 - talk pages
- c.execute('select text.old_text, revision.rev_timestamp, '
- 'revision.rev_user_text '
- 'from page '
- 'left join revision on revision.rev_id = page.page_latest '
- 'left join text on text.old_id = revision.rev_text_id '
- 'where page.page_title = %s and page.page_namespace = %s '
- 'limit 1', query_attrs)
-
- row = c.fetchone()
- if row:
- text, timestamp, username = row
- return {'text': text, 'timestamp': timestamp, 'username': username}
-
- def _attachments(self, page_id):
- """Yield path to next file attached to wiki page"""
- c = self.connection().cursor()
- c.execute('select il_to from imagelinks '
- 'where il_from = %s' % page_id)
- for row in c:
- name = row[0]
- # mediawiki stores attachmets in subdirectories
- # based on md5-hash of filename
- # so we need to build path to file as follows
- md5 = hashlib.md5(name).hexdigest()
- path = os.path.join(self.options.attachments_dir,
- md5[:1], md5[:2], name)
- if os.path.isfile(path):
- yield path
-
- def extract(self):
- self.extract_pages()
-
- def extract_pages(self):
- allura_base.log.info('Extracting pages...')
- for page in self._pages():
- self.extract_history(page)
- self.extract_talk(page)
- self.extract_attachments(page)
- allura_base.log.info('Extracting pages done')
-
- def extract_history(self, page):
- page_id = page['page_id']
- for page_data in self._history(page_id):
- page_data.update(page)
- self._save(json.dumps(page_data), 'pages', str(page_id),
- 'history', str(page_data['timestamp']) + '.json')
- allura_base.log.info('Extracted history for page %s (%s)'
- % (page_id, page['title']))
-
- def extract_talk(self, page):
- page_id = page['page_id']
- talk_page_data = self._talk(page['title'])
- if talk_page_data:
- self._save(json.dumps(talk_page_data), 'pages', str(page_id),
- 'discussion.json')
- allura_base.log.info('Extracted talk for page %s (%s)'
- % (page_id, page['title']))
-
- allura_base.log.info('No talk for page %s (%s)'
- % (page_id, page['title']))
-
- def extract_attachments(self, page):
- page_id = page['page_id']
- for filepath in self._attachments(page_id):
- self._save_attachment(filepath, 'pages', str(page_id),
- 'attachments')
- allura_base.log.info('Extracted attachments for page %s (%s)'
- % (page_id, page['title']))
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py b/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
deleted file mode 100644
index d7a3ce4..0000000
--- a/ForgeWiki/forgewiki/command/wiki2markdown/loaders.py
+++ /dev/null
@@ -1,191 +0,0 @@
-import os
-import json
-import datetime
-from pylons import tmpl_context as c
-from ming.orm.ormsession import ThreadLocalORMSession
-
-from allura import model as M
-from forgewiki import model as WM
-from forgewiki.converters import mediawiki2markdown
-from forgewiki.converters import mediawiki_internal_links2markdown
-from allura.command import base as allura_base
-from allura.lib import helpers as h
-from allura.lib import utils
-from allura.model.session import artifact_orm_session
-
-
-class MediawikiLoader(object):
- """Load MediaWiki data from json to Allura wiki tool"""
- TIMESTAMP_FMT = '%Y%m%d%H%M%S'
-
- def __init__(self, options):
- self.options = options
- self.nbhd = M.Neighborhood.query.get(name=options.nbhd)
- if not self.nbhd:
- allura_base.log.error("Can't find neighborhood with name %s"
- % options.nbhd)
- exit(2)
- self.project = M.Project.query.get(shortname=options.project,
- neighborhood_id=self.nbhd._id)
- if not self.project:
- allura_base.log.error("Can't find project with shortname %s "
- "and neighborhood_id %s"
- % (options.project, self.nbhd._id))
- exit(2)
-
- self.wiki = self.project.app_instance('wiki')
- if not self.wiki:
- allura_base.log.error("Can't find wiki app in given project")
- exit(2)
-
- h.set_context(self.project.shortname, 'wiki', neighborhood=self.nbhd)
- self.project.notifications_disabled = True
-
- def exit(self, status):
- self.project.notifications_disabled = False
- ThreadLocalORMSession.flush_all()
- ThreadLocalORMSession.close_all()
- exit(status)
-
- def load(self):
- artifact_orm_session._get().skip_mod_date = True
- self.load_pages()
- self.project.notifications_disabled = False
- artifact_orm_session._get().skip_mod_date = False
- ThreadLocalORMSession.flush_all()
- ThreadLocalORMSession.close_all()
- allura_base.log.info('Loading wiki done')
-
- def _pages(self):
- """Yield path to page dump directory for next wiki page"""
- pages_dir = os.path.join(self.options.dump_dir, 'pages')
- pages = []
- if not os.path.isdir(pages_dir):
- return
- pages = os.listdir(pages_dir)
- for directory in pages:
- dir_path = os.path.join(pages_dir, directory)
- if os.path.isdir(dir_path):
- yield dir_path
-
- def _history(self, page_dir):
- """Yield page_data for next wiki page in edit history"""
- page_dir = os.path.join(page_dir, 'history')
- if not os.path.isdir(page_dir):
- return
- pages = os.listdir(page_dir)
- pages.sort() # ensure that history in right order
- for page in pages:
- fn = os.path.join(page_dir, page)
- try:
- with open(fn, 'r') as pages_file:
- page_data = json.load(pages_file)
- except IOError, e:
- allura_base.log.error("Can't open file: %s" % str(e))
- self.exit(2)
- except ValueError, e:
- allura_base.log.error("Can't load data from file %s: %s"
- % (fn, str(e)))
- self.exit(2)
- yield page_data
-
- def _talk(self, page_dir):
- """Return talk data from json dump"""
- filename = os.path.join(page_dir, 'discussion.json')
- if not os.path.isfile(filename):
- return
- try:
- with open(filename, 'r') as talk_file:
- talk_data = json.load(talk_file)
- except IOError, e:
- allura_base.log.error("Can't open file: %s" % str(e))
- self.exit(2)
- except ValueError, e:
- allura_base.log.error("Can't load data from file %s: %s"
- % (filename, str(e)))
- self.exit(2)
- return talk_data
-
- def _attachments(self, page_dir):
- """Yield (filename, full path) to next attachment for given page."""
- attachments_dir = os.path.join(page_dir, 'attachments')
- if not os.path.isdir(attachments_dir):
- return
- attachments = os.listdir(attachments_dir)
- for filename in attachments:
- yield filename, os.path.join(attachments_dir, filename)
-
- def load_pages(self):
- """Load pages with edit history from json to Allura wiki tool"""
- allura_base.log.info('Loading pages into allura...')
- for page_dir in self._pages():
- for page in self._history(page_dir):
- p = WM.Page.upsert(page['title'])
- p.viewable_by = ['all']
- p.text = mediawiki_internal_links2markdown(
- mediawiki2markdown(page['text']),
- page['title'])
- timestamp = datetime.datetime.strptime(page['timestamp'],
- self.TIMESTAMP_FMT)
- p.mod_date = timestamp
- c.user = (M.User.query.get(username=page['username'].lower())
- or M.User.anonymous())
- ss = p.commit()
- ss.mod_date = ss.timestamp = timestamp
-
- # set home to main page
- if page['title'] == 'Main_Page':
- gl = WM.Globals.query.get(app_config_id=self.wiki.config._id)
- if gl is not None:
- gl.root = page['title']
- allura_base.log.info('Loaded history of page %s (%s)'
- % (page['page_id'], page['title']))
-
- self.load_talk(page_dir, page['title'])
- self.load_attachments(page_dir, page['title'])
-
- def load_talk(self, page_dir, page_title):
- """Load talk for page.
-
- page_dir - path to directory with page dump.
- page_title - page title in Allura Wiki
- """
- talk_data = self._talk(page_dir)
- if not talk_data:
- return
- text = mediawiki2markdown(talk_data['text'])
- page = WM.Page.query.get(app_config_id=self.wiki.config._id,
- title=page_title)
- if not page:
- return
- thread = M.Thread.query.get(ref_id=page.index_id())
- if not thread:
- return
- timestamp = datetime.datetime.strptime(talk_data['timestamp'],
- self.TIMESTAMP_FMT)
- c.user = (M.User.query.get(username=talk_data['username'].lower())
- or M.User.anonymous())
- thread.add_post(
- text=text,
- discussion_id=thread.discussion_id,
- thread_id=thread._id,
- timestamp=timestamp,
- ignore_security=True)
- allura_base.log.info('Loaded talk for page %s' % page_title)
-
- def load_attachments(self, page_dir, page_title):
- """Load attachments for page.
-
- page_dir - path to directory with page dump.
- """
- page = WM.Page.query.get(app_config_id=self.wiki.config._id,
- title=page_title)
- for filename, path in self._attachments(page_dir):
- try:
- with open(path) as fp:
- page.attach(filename, fp,
- content_type=utils.guess_mime_type(filename))
- except IOError, e:
- allura_base.log.error("Can't open file: %s" % str(e))
- self.exit(2)
- allura_base.log.info('Loaded attachments for page %s.' % page_title)
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/scripts/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/__init__.py b/ForgeWiki/forgewiki/scripts/__init__.py
new file mode 100644
index 0000000..e69de29
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/scripts/wiki2markdown/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/wiki2markdown/__init__.py b/ForgeWiki/forgewiki/scripts/wiki2markdown/__init__.py
new file mode 100644
index 0000000..2fe4b24
--- /dev/null
+++ b/ForgeWiki/forgewiki/scripts/wiki2markdown/__init__.py
@@ -0,0 +1 @@
+from wiki2markdown import Wiki2Markdown
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/scripts/wiki2markdown/extractors.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/wiki2markdown/extractors.py b/ForgeWiki/forgewiki/scripts/wiki2markdown/extractors.py
new file mode 100644
index 0000000..8d95317
--- /dev/null
+++ b/ForgeWiki/forgewiki/scripts/wiki2markdown/extractors.py
@@ -0,0 +1,171 @@
+import logging
+import os
+import shutil
+import json
+import hashlib
+
+log = logging.getLogger(__name__)
+
+
+class MediawikiExtractor(object):
+ """Base class for MediaWiki data provider"""
+
+ def __init__(self, options):
+ self.options = options
+ if os.path.exists(self.options.dump_dir):
+ # clear dump_dir before extraction (there may be old data)
+ shutil.rmtree(self.options.dump_dir)
+ os.makedirs(self.options.dump_dir)
+
+ def extract(self):
+ """Extract pages with history, attachments, talk-pages, etc"""
+ raise NotImplementedError("subclass must override this")
+
+
+class MySQLExtractor(MediawikiExtractor):
+ """Extract MediaWiki data to json.
+
+ Use connection to MySQL database as a data source.
+ """
+
+ def __init__(self, options):
+ super(MySQLExtractor, self).__init__(options)
+ self._connection = None
+ self.db_options = {
+ 'host': self.options.host or 'localhost',
+ 'user': self.options.user,
+ 'passwd': self.options.password,
+ 'db': self.options.db_name,
+ 'port': self.options.port or 3306
+ }
+
+ def connection(self):
+ try:
+ import MySQLdb
+ except ImportError:
+ raise ImportError('GPL library MySQL-python is required for this operation')
+
+ if not self._connection:
+ self._connection = MySQLdb.connect(**self.db_options)
+ return self._connection
+
+ def _save(self, content, *paths):
+ """Save json to file in local filesystem"""
+ out_file = os.path.join(self.options.dump_dir, *paths)
+ if not os.path.exists(os.path.dirname(out_file)):
+ os.makedirs(os.path.dirname(out_file))
+ with open(out_file, 'w') as out:
+ out.write(content.encode('utf-8'))
+
+ def _save_attachment(self, filepath, *paths):
+ """Save attachment in dump directory.
+
+ Copy from mediawiki dump directory to our internal dump directory.
+
+ args:
+ filepath - path to attachment in mediawiki dump.
+ *paths - path to internal dump directory.
+ """
+ out_dir = os.path.join(self.options.dump_dir, *paths)
+ if not os.path.exists(out_dir):
+ os.makedirs(out_dir)
+ shutil.copy(filepath, out_dir)
+
+ def _pages(self):
+ """Yield page_data for next wiki page"""
+ c = self.connection().cursor()
+ c.execute('select page.page_id, page.page_title '
+ 'from page where page.page_namespace = 0')
+ for row in c:
+ _id, title = row
+ page_data = {
+ 'page_id': _id,
+ 'title': title,
+ }
+ yield page_data
+
+ def _history(self, page_id):
+ """Yield page_data for next revision of wiki page"""
+ c = self.connection().cursor()
+ c.execute('select revision.rev_timestamp, text.old_text, '
+ 'revision.rev_user_text '
+ 'from revision '
+ 'left join text on revision.rev_text_id = text.old_id '
+ 'where revision.rev_page = %s', page_id)
+ for row in c:
+ timestamp, text, username = row
+ page_data = {
+ 'timestamp': timestamp,
+ 'text': text or '',
+ 'username': username
+ }
+ yield page_data
+
+ def _talk(self, page_title):
+ """Return page_data for talk page with `page_title` title"""
+ c = self.connection().cursor()
+ query_attrs = (page_title, 1) # page_namespace == 1 - talk pages
+ c.execute('select text.old_text, revision.rev_timestamp, '
+ 'revision.rev_user_text '
+ 'from page '
+ 'left join revision on revision.rev_id = page.page_latest '
+ 'left join text on text.old_id = revision.rev_text_id '
+ 'where page.page_title = %s and page.page_namespace = %s '
+ 'limit 1', query_attrs)
+
+ row = c.fetchone()
+ if row:
+ text, timestamp, username = row
+ return {'text': text, 'timestamp': timestamp, 'username': username}
+
+ def _attachments(self, page_id):
+ """Yield path to next file attached to wiki page"""
+ c = self.connection().cursor()
+ c.execute('select il_to from imagelinks '
+ 'where il_from = %s' % page_id)
+ for row in c:
+ name = row[0]
+ # mediawiki stores attachments in subdirectories
+ # based on md5-hash of filename
+ # so we need to build path to file as follows
+ md5 = hashlib.md5(name).hexdigest()
+ path = os.path.join(self.options.attachments_dir,
+ md5[:1], md5[:2], name)
+ if os.path.isfile(path):
+ yield path
+
+ def extract(self):
+ self.extract_pages()
+
+ def extract_pages(self):
+ log.info('Extracting pages...')
+ for page in self._pages():
+ self.extract_history(page)
+ self.extract_talk(page)
+ self.extract_attachments(page)
+ log.info('Extracting pages done')
+
+ def extract_history(self, page):
+ page_id = page['page_id']
+ for page_data in self._history(page_id):
+ page_data.update(page)
+ self._save(json.dumps(page_data), 'pages', str(page_id),
+ 'history', str(page_data['timestamp']) + '.json')
+ log.info('Extracted history for page %s (%s)', page_id, page['title'])
+
+ def extract_talk(self, page):
+ page_id = page['page_id']
+ talk_page_data = self._talk(page['title'])
+ if talk_page_data:
+ self._save(json.dumps(talk_page_data), 'pages', str(page_id),
+ 'discussion.json')
+ log.info('Extracted talk for page %s (%s)', page_id, page['title'])
+ else:
+ log.info('No talk for page %s (%s)', page_id, page['title'])
+
+ def extract_attachments(self, page):
+ page_id = page['page_id']
+ for filepath in self._attachments(page_id):
+ self._save_attachment(filepath, 'pages', str(page_id),
+ 'attachments')
+ log.info('Extracted attachments for page %s (%s)', page_id, page['title'])
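A note on the attachment lookup in _attachments() above: MediaWiki keeps uploaded files in a two-level hashed directory layout derived from the MD5 hex digest of the file name (first one hex character, then the first two), which is what the md5[:1]/md5[:2] path join reconstructs. A standalone sketch of the same computation (hypothetical helper name and paths, Python 2 like the code above):

    import hashlib
    import os

    def mediawiki_upload_path(images_dir, name):
        # 'Example.png' -> <images_dir>/<a>/<ab>/Example.png, where 'a' and
        # 'ab' are the first one and two characters of md5(name).hexdigest()
        digest = hashlib.md5(name).hexdigest()
        return os.path.join(images_dir, digest[:1], digest[:2], name)

    print(mediawiki_upload_path('/var/www/mediawiki/images', 'Example.png'))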
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/scripts/wiki2markdown/loaders.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/wiki2markdown/loaders.py b/ForgeWiki/forgewiki/scripts/wiki2markdown/loaders.py
new file mode 100644
index 0000000..00487a1
--- /dev/null
+++ b/ForgeWiki/forgewiki/scripts/wiki2markdown/loaders.py
@@ -0,0 +1,182 @@
+import logging
+import os
+import json
+import datetime
+from pylons import tmpl_context as c
+from ming.orm.ormsession import ThreadLocalORMSession
+
+from allura import model as M
+from forgewiki import model as WM
+from forgewiki.converters import mediawiki2markdown
+from forgewiki.converters import mediawiki_internal_links2markdown
+from allura.lib import helpers as h
+from allura.lib import utils
+from allura.model.session import artifact_orm_session
+
+log = logging.getLogger(__name__)
+
+
+class MediawikiLoader(object):
+ """Load MediaWiki data from json to Allura wiki tool"""
+ TIMESTAMP_FMT = '%Y%m%d%H%M%S'
+
+ def __init__(self, options):
+ self.options = options
+ self.nbhd = M.Neighborhood.query.get(name=options.nbhd)
+ if not self.nbhd:
+ raise ValueError("Can't find neighborhood with name %s"
+ % options.nbhd)
+ self.project = M.Project.query.get(shortname=options.project,
+ neighborhood_id=self.nbhd._id)
+ if not self.project:
+ raise ValueError("Can't find project with shortname %s "
+ "and neighborhood_id %s"
+ % (options.project, self.nbhd._id))
+
+ self.wiki = self.project.app_instance('wiki')
+ if not self.wiki:
+ raise ValueError("Can't find wiki app in given project")
+
+ h.set_context(self.project.shortname, 'wiki', neighborhood=self.nbhd)
+
+ def load(self):
+ try:
+ self.project.notifications_disabled = True
+ artifact_orm_session._get().skip_mod_date = True
+ self.load_pages()
+ ThreadLocalORMSession.flush_all()
+ log.info('Loading wiki done')
+ finally:
+ self.project.notifications_disabled = False
+ artifact_orm_session._get().skip_mod_date = False
+
+ def _pages(self):
+ """Yield path to page dump directory for next wiki page"""
+ pages_dir = os.path.join(self.options.dump_dir, 'pages')
+ pages = []
+ if not os.path.isdir(pages_dir):
+ return
+ pages = os.listdir(pages_dir)
+ for directory in pages:
+ dir_path = os.path.join(pages_dir, directory)
+ if os.path.isdir(dir_path):
+ yield dir_path
+
+ def _history(self, page_dir):
+ """Yield page_data for next wiki page in edit history"""
+ page_dir = os.path.join(page_dir, 'history')
+ if not os.path.isdir(page_dir):
+ return
+ pages = os.listdir(page_dir)
+ pages.sort() # ensure that history is in the right order
+ for page in pages:
+ fn = os.path.join(page_dir, page)
+ try:
+ with open(fn, 'r') as pages_file:
+ page_data = json.load(pages_file)
+ except IOError, e:
+ log.error("Can't open file: %s", str(e))
+ raise
+ except ValueError, e:
+ log.error("Can't load data from file %s: %s", fn, str(e))
+ raise
+ yield page_data
+
+ def _talk(self, page_dir):
+ """Return talk data from json dump"""
+ filename = os.path.join(page_dir, 'discussion.json')
+ if not os.path.isfile(filename):
+ return
+ try:
+ with open(filename, 'r') as talk_file:
+ talk_data = json.load(talk_file)
+ except IOError, e:
+ log.error("Can't open file: %s", str(e))
+ raise
+ except ValueError, e:
+ log.error("Can't load data from file %s: %s", filename, str(e))
+ raise
+ return talk_data
+
+ def _attachments(self, page_dir):
+ """Yield (filename, full path) to next attachment for given page."""
+ attachments_dir = os.path.join(page_dir, 'attachments')
+ if not os.path.isdir(attachments_dir):
+ return
+ attachments = os.listdir(attachments_dir)
+ for filename in attachments:
+ yield filename, os.path.join(attachments_dir, filename)
+
+ def load_pages(self):
+ """Load pages with edit history from json to Allura wiki tool"""
+ log.info('Loading pages into allura...')
+ for page_dir in self._pages():
+ for page in self._history(page_dir):
+ p = WM.Page.upsert(page['title'])
+ p.viewable_by = ['all']
+ p.text = mediawiki_internal_links2markdown(
+ mediawiki2markdown(page['text']),
+ page['title'])
+ timestamp = datetime.datetime.strptime(page['timestamp'],
+ self.TIMESTAMP_FMT)
+ p.mod_date = timestamp
+ c.user = (M.User.query.get(username=page['username'].lower())
+ or M.User.anonymous())
+ ss = p.commit()
+ ss.mod_date = ss.timestamp = timestamp
+
+ # set home to main page
+ if page['title'] == 'Main_Page':
+ gl = WM.Globals.query.get(app_config_id=self.wiki.config._id)
+ if gl is not None:
+ gl.root = page['title']
+ log.info('Loaded history of page %s (%s)', page['page_id'], page['title'])
+
+ self.load_talk(page_dir, page['title'])
+ self.load_attachments(page_dir, page['title'])
+
+ def load_talk(self, page_dir, page_title):
+ """Load talk for page.
+
+ page_dir - path to directory with page dump.
+ page_title - page title in Allura Wiki
+ """
+ talk_data = self._talk(page_dir)
+ if not talk_data:
+ return
+ text = mediawiki2markdown(talk_data['text'])
+ page = WM.Page.query.get(app_config_id=self.wiki.config._id,
+ title=page_title)
+ if not page:
+ return
+ thread = M.Thread.query.get(ref_id=page.index_id())
+ if not thread:
+ return
+ timestamp = datetime.datetime.strptime(talk_data['timestamp'],
+ self.TIMESTAMP_FMT)
+ c.user = (M.User.query.get(username=talk_data['username'].lower())
+ or M.User.anonymous())
+ thread.add_post(
+ text=text,
+ discussion_id=thread.discussion_id,
+ thread_id=thread._id,
+ timestamp=timestamp,
+ ignore_security=True)
+ log.info('Loaded talk for page %s', page_title)
+
+ def load_attachments(self, page_dir, page_title):
+ """Load attachments for page.
+
+ page_dir - path to directory with page dump.
+ """
+ page = WM.Page.query.get(app_config_id=self.wiki.config._id,
+ title=page_title)
+ for filename, path in self._attachments(page_dir):
+ try:
+ with open(path) as fp:
+ page.attach(filename, fp,
+ content_type=utils.guess_mime_type(filename))
+ except IOError, e:
+ log.error("Can't open file: %s", str(e))
+ raise
+ log.info('Loaded attachments for page %s.', page_title)
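A note on the timestamp handling in load_pages() and load_talk() above: MediaWiki revision timestamps arrive as 14-character strings, and TIMESTAMP_FMT = '%Y%m%d%H%M%S' turns them back into datetime objects so the page's mod_date and the committed snapshot can be backdated to the original edit time. A quick illustration with a made-up value:

    import datetime

    TIMESTAMP_FMT = '%Y%m%d%H%M%S'
    ts = datetime.datetime.strptime('20130228170356', TIMESTAMP_FMT)
    print(ts)  # 2013-02-28 17:03:56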
http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/4d2f710a/ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py b/ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py
new file mode 100644
index 0000000..e837f45f
--- /dev/null
+++ b/ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py
@@ -0,0 +1,109 @@
+import argparse
+import logging
+import shutil
+import tempfile
+
+from tg import config
+
+from allura.lib import helpers as h
+from allura.scripts import ScriptTask
+
+from forgewiki.scripts.wiki2markdown.extractors import MySQLExtractor
+from forgewiki.scripts.wiki2markdown.loaders import MediawikiLoader
+
+log = logging.getLogger(__name__)
+
+
+class Wiki2Markdown(ScriptTask):
+ """Import MediaWiki to Allura Wiki tool"""
+ @classmethod
+ def parser(cls):
+ parser = argparse.ArgumentParser(description='Import wiki from '
+ 'mediawiki-dump to allura wiki')
+ parser.add_argument('-e', '--extract-only', action='store_true',
+ dest='extract',
+ help='Store data from the mediawiki-dump '
+ 'on the local filesystem; do not load into Allura')
+ parser.add_argument('-l', '--load-only', action='store_true', dest='load',
+ help='Load into Allura previously-extracted data')
+ parser.add_argument('-d', '--dump-dir', dest='dump_dir', default='',
+ help='Directory for dump files')
+ parser.add_argument('-n', '--neighborhood', dest='nbhd', default='',
+ help='Neighborhood name to load data')
+ parser.add_argument('-p', '--project', dest='project', default='',
+ help='Project shortname to load data into')
+ parser.add_argument('-a', '--attachments-dir', dest='attachments_dir',
+ help='Path to directory with mediawiki attachments dump',
+ default='')
+ parser.add_argument('--db_config_prefix', dest='db_config_prefix',
+ help='Key prefix (e.g. "legacy.") in ini file to '
+ 'use instead of commandline db params')
+ parser.add_argument('-s', '--source', dest='source', default='mysql',
+ help='Database type to extract from (only mysql for now)')
+ parser.add_argument('--db_name', dest='db_name', default='mediawiki',
+ help='Database name')
+ parser.add_argument('--host', dest='host', default='localhost',
+ help='Database host')
+ parser.add_argument('--port', dest='port', type=int, default=0,
+ help='Database port')
+ parser.add_argument('--user', dest='user', default='',
+ help='User for database connection')
+ parser.add_argument('--password', dest='password', default='',
+ help='Password for database connection')
+ parser.add_argument('--keep-dumps', action='store_true', dest='keep_dumps',
+ help='Leave dump files on disk after run')
+ return parser
+
+ @classmethod
+ def execute(cls, options):
+ options = cls.handle_options(options)
+
+ try:
+ if options.extract:
+ MySQLExtractor(options).extract()
+ if options.load:
+ MediawikiLoader(options).load()
+ finally:
+ if not options.keep_dumps:
+ shutil.rmtree(options.dump_dir)
+
+ @classmethod
+ def handle_options(cls, options):
+ if not options.extract and not options.load:
+ # if no action specified, do both
+ options.extract = True
+ options.load = True
+
+ if not options.dump_dir:
+ if options.load and not options.extract:
+ raise ValueError('You must specify directory containing dump files')
+ else:
+ options.dump_dir = tempfile.mkdtemp()
+ log.info("Writing temp files to %s", options.dump_dir)
+
+ if options.load and (not options.project or not options.nbhd):
+ raise ValueError('You must specify neighborhood and project '
+ 'to load data')
+
+ if options.extract:
+ if options.db_config_prefix:
+ for k, v in h.config_with_prefix(config, options.db_config_prefix).iteritems():
+ if k == 'port':
+ v = int(v)
+ setattr(options, k, v)
+
+ if options.source == 'mysql':
+ pass
+ elif options.source in ('sqlite', 'postgres', 'sql-dump'):
+ raise ValueError('This source is not implemented yet. Only mysql for now')
+ else:
+ raise ValueError('You must specify a valid data source')
+
+ if not options.attachments_dir:
+ raise ValueError('You must specify path to directory with mediawiki attachments dump.')
+
+ return options
+
+
+if __name__ == '__main__':
+ Wiki2Markdown.main()
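Since the module ends with an __main__ guard calling Wiki2Markdown.main() (inherited from ScriptTask), the importer can presumably also be run directly as a script. A hedged usage sketch, using only the arguments defined in parser() above; the neighborhood/project names, database credentials, and attachments path are placeholders for a local installation:

    python ForgeWiki/forgewiki/scripts/wiki2markdown/wiki2markdown.py \
        -n Projects -p test \
        -s mysql --db_name mediawiki --host localhost --port 3306 \
        --user mwuser --password secret \
        -a /var/www/mediawiki/images --keep-dumps

With neither -e/--extract-only nor -l/--load-only given, handle_options() enables both phases, extracts into a temporary directory from tempfile.mkdtemp(), and removes it afterwards unless --keep-dumps is passed.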