use installed MoinMoin and DokuWiki to get accurate results

2011-02-10 10:49:37 +02:00 · 2011-02-10 10:49:37 +02:00 · d339053bf1
commit d339053bf1
parent 52428be665
1 changed files with 46 additions and 66 deletions
--- a/moin2doku.py
+++ b/moin2doku.py
@ -8,12 +8,13 @@
 # Call with the name of the directory containing the MoinMoin pages and that
 # of the directory to receive the DokuWiki pages on the command line:
 #
-# python moin2doku.py ./moin/data/pages/ ./doku/
+# You need to run this on a host where both MoinMoin and DokuWiki are
 # configured; it will use the current configuration from both wikis.
 #
-# then move the doku pages to e.g. /var/www/MyWikiName/data/pages/,
+# python moin2doku.py ./moin/data/pages/
-# move the media files to e.g. /var/www/MyWikiName/data/media/,
+#
-# set ownership: chown -R www-data:www-data /var/www/MyWikiName/data/pages/*
+# set ownership: chown -R www-data:www-data /var/lib/dokuwiki/pages/*
-# chown -R www-data:www-data /var/www/MyWikiName/data/media/*
+# chown -R www-data:www-data /var/lib/dokuwiki/media/*
 #
 # This script doesn't do all the work, and some of the work it does is
 # wrong. For instance attachment links end up with the trailing "|}}"
@ -32,6 +33,7 @@ import getopt
 from shutil import copyfile, copystat
 from os import listdir
 from os.path import isdir, basename
 from doku import DokuWiki
 def check_dirs(moin_pages_dir, output_dir):
  if not isdir(moin_pages_dir):
@ -68,10 +70,10 @@ def writefile(filename, content, overwrite=False):
  f.writelines([it.rstrip() + '\n' for it in content if it])
  f.close()
-def get_current_revision(page_dir):
+def get_current_revision(pagedir):
-  rev_dir = os.path.join(page_dir, 'revisions')
+  rev_dir = os.path.join(pagedir, 'revisions')
  # try "current" file first
-  f = os.path.join(page_dir, 'current')
+  f = os.path.join(pagedir, 'current')
  if os.path.exists(f):
    rev = readfile(f)[0].rstrip()
    try:
@ -85,7 +87,7 @@ def get_current_revision(page_dir):
    revisions.sort()
    rev = revisions[-1]
-  print "%s rev: %s" % (page_dir, rev)
+  print "%s rev: %s" % (pagedir, rev)
  f = os.path.join(rev_dir, rev)
  if not os.path.exists(f):
    # deleted pages have '00000002' in current, and no existing file
@ -93,28 +95,34 @@ def get_current_revision(page_dir):
  return f
-def copy_attachments(page_dir, attachment_dir):
+# pagedir = MoinMoin page dir
-  dir = os.path.join(page_dir, 'attachments')
+# ns = DokuWiki namespace to copy the attachments into
 def copy_attachments(pagedir, ns):
  dir = os.path.join(pagedir, 'attachments')
  if not isdir(dir):
    return
  attachment_dir = dw.mediaFn(ns)
  if not isdir(attachment_dir):
    os.makedirs(attachment_dir);
  attachments = listdir(dir)
  for attachment in attachments:
    src = os.path.join(dir, attachment)
-    dst = os.path.join(attachment_dir, attachment.lower())
+    dst = dw.mediaFn(dw.cleanID("%s/%s" % (ns, attachment)))
    copyfile(src, dst)
    copystat(src, dst)
-def convert_markup(content, filename):
+# convert page markup
 # pagename: name of current page (MoinMoin name)
 # content: page content (MoinMoin markup)
 def convert_markup(pagename, content):
  """
  convert page markup
  """
  namespace = ':'
-  for i in range(0, len(filename) - 1):
+#  for i in range(0, len(filename) - 1):
-    namespace += filename[i] + ':'
+#    namespace += filename[i] + ':'
  # http://www.pld-linux.org/SyntaxReference
  regexp = (
@ -155,7 +163,7 @@ def convert_markup(content, filename):
  # web link with title
  ('\[((?:http|https|file)[^\s]+)\s+(.+?)\]', '[[\\1|\\2]]'),
-  ('\["/(.*)"\]', '[['+filename[-1]+':\\1]]'),
+#  ('\["/(.*)"\]', '[['+filename[-1]+':\\1]]'),
  # code blocks
  # open and language
@ -203,69 +211,39 @@ def print_help():
  print "-f FILE - convert signle file"
  sys.exit(0)
-def unquote(filename):
+# return unicode encoded wikiname
-  filename = filename.lower()
+# input is a dir from moinmoin pages/ dir
-  filename = filename.replace('(2d)', '-')          # hyphen
+def wikiname(filename):
-  filename = filename.replace('(20)', '_')          # space->underscore
+  from MoinMoin import wikiutil
-  filename = filename.replace('(2e)', '_')          # decimal point->underscore
+  return wikiutil.unquoteWikiname(basename(filename))
  filename = filename.replace('(29)', '_')          # )->underscore
  filename = filename.replace('(28)', '_')          # (->underscore
  filename = filename.replace('.', '_')             # decimal point->underscore
  filename = filename.replace('(2c20)', '_')        # comma + space->underscore
  filename = filename.replace('(2028)', '_')        # space + (->underscore
  filename = filename.replace('(2920)', '_')        # ) + space->underscore
  filename = filename.replace('(2220)', 'inch_')    # " + space->inch + underscore
  filename = filename.replace('(3a20)', '_')        # : + space->underscore
  filename = filename.replace('(202827)', '_')      # space+(+'->underscore
  filename = filename.replace('(2720)', '_')        # '+ space->underscore
  filename = filename.replace('(c3bc)', 'ue')       # umlaut
  filename = filename.replace('(c384)', 'Ae')       # umlaut
  filename = filename.replace('(c3a4)', 'ae')       # umlaut
  filename = filename.replace('(c3b6)', 'oe')       # umlaut
  return filename
-def convertfile(pathname, overwrite = False):
+def convertfile(pagedir, overwrite = False):
-  print "-> %s" % pathname
+  pagedir  = os.path.abspath(pagedir)
-  curr_rev = get_current_revision(pathname)
+  print "-> %s" % pagedir
  curr_rev = get_current_revision(pagedir)
  if curr_rev == None:
-    print "SKIP %s: no current revision" % pathname
+    print "SKIP %s: no current revision" % pagedir
    return
  if not os.path.exists(curr_rev):
    print "SKIP %s: filename missing" % curr_rev
    return
-  page_name = basename(pathname)
+  pagename = wikiname(pagedir)
-  if page_name.count('MoinEditorBackup') > 0:
+  print "pagename: [%s]" % pagename
-    print "SKIP %s: skip backups" % pathname
+
  if pagename.count('MoinEditorBackup') > 0:
    print "SKIP %s: skip backups" % pagedir
    return
  content = readfile(curr_rev)
-
+  content = convert_markup(pagename, content)
-  page_name = unquote(page_name)
+  out_file = dw.wikiFn(pagename)
-  print "dokuname: %s" % page_name
+  print "dokuname: [%s]" % out_file
 # split by namespace separator
  ns = page_name.split('(2f)')
  count = len(ns)
  id = ns[-1]
  dir = output_dir
  attachment_dir = os.path.join(output_dir, 'media')
  # root namespace files go to "unsorted"
  if count == 1:
    ns.insert(0, 'unsorted')
  for p in ns[:-1]:
    dir = os.path.join(dir, p);
    attachment_dir = os.path.join(attachment_dir, p);
  content = convert_markup(content, ns)
  out_file = os.path.join(dir, id + '.txt')
  writefile(out_file, content, overwrite = overwrite)
-  copy_attachments(pathname, attachment_dir)
+  ns = dw.getNS(dw.cleanID(pagename))
  copy_attachments(pagedir, ns)
  return 1
@ -299,6 +277,8 @@ check_dirs(moin_pages_dir, output_dir)
 print 'Input dir is: %s.' % moin_pages_dir
 print 'Output dir is: %s.' % output_dir
 dw = DokuWiki()
 if inputfile != None:
  res = convertfile(inputfile, overwrite = overwrite)
 else: