- merge python code v2

http://www.dokuwiki.org/tips:moinmoin2doku?rev=1265494867#another_python_script
2011-02-06 19:05:22 +02:00 · 2011-02-06 19:05:22 +02:00 · c0629ccb89
commit c0629ccb89
parent 5f64d725ee
1 changed files with 138 additions and 74 deletions
--- a/moin2doku.py
+++ b/moin2doku.py
@@ -1,6 +1,29 @@
 #!/usr/bin/python
-import sys, os, os.path
-import re
+#
+# moin2doku.py
+#
+# A script for converting MoinMoin version 1.3+ wiki data to DokuWiki format.
+# Call with the name of the directory containing the MoinMoin pages and that
+# of the directory to receive the DokuWiki pages on the command line:
+#
+# python moin2doku.py ./moin/data/pages/ ./doku/
+#
+# then move the doku pages to e.g. /var/www/MyWikiName/data/pages/,
+# move the media files to e.g. /var/www/MyWikiName/data/media/,
+# set ownership: chown -R www-data:www-data /var/www/MyWikiName/data/pages/*
+# chown -R www-data:www-data /var/www/MyWikiName/data/media/*
+#
+# This script doesn't do all the work, and some of the work it does is
+# wrong. For instance attachment links end up with the trailing "|}}"
+# on the line following the link. This works, but doesn't look good.
+# The script interprets a "/" in a pagename as a namespace delimiter and
+# creates and fills namespace subdirectories accordingly.
+#
+# version 0.1  02.2010  Slim Gaillard, based on the "extended python"
+#                       convert.py script here:
+#                       http://www.dokuwiki.org/tips:moinmoin2doku
+#
+import sys, os, os.path, re, pdb
 from os import listdir
 from os.path import isdir, basename

@@ -13,14 +36,16 @@ def check_dirs(moin_pages_dir, output_dir):
        print >> sys.stderr, "Output directory doesn't exist!"
        sys.exit(1)

-def get_page_names(moin_pages_dir):
+def get_path_names(moin_pages_dir):
    items = listdir(moin_pages_dir)
-    pages = []
+    pathnames = []
+
    for item in items:
        item = os.path.join(moin_pages_dir, item)
        if isdir(item):
-            pages.append(item)
-    return pages
+            pathnames.append(item)
+
+    return pathnames

 def get_current_revision(page_dir):
    rev_dir = os.path.join(page_dir, 'revisions')
@@ -34,8 +59,10 @@ def copy_attachments(page_dir, attachment_dir):
  dir = os.path.join(page_dir,'attachments')
  if isdir(dir):
    attachments = listdir(dir)
+    #pdb.set_trace()
    for attachment in attachments:
-      os.system ('cp "' + dir +'/' + attachment + '" "' + attachment_dir +'"')
+      cmd_string = 'cp "' + dir +'/' + attachment + '" "' + attachment_dir + attachment.lower() + '"'
+      os.system ( cmd_string )

 def convert_page(page, file):
    namespace = ':'
@@ -43,30 +70,32 @@ def convert_page(page, file):
      namespace += file[i] + ':'

    regexp = (
-        ('\[\[TableOfContents.*\]\]', ''),            # remove
+        ('\[\[TableOfContents.*\]\]', ''),          # remove
        ('\[\[BR\]\]$', ''),                        # newline at end of line - remove
        ('\[\[BR\]\]', '\n'),                       # newline
        ('#pragma section-numbers off', ''),        # remove
        ('^##.*?\\n', ''),                          # remove
-        ('\[:(.*):',  '[[\\1]] '),                 # internal link
-        ('\[\[(.*)/(.*)\]\]',  '[[\\1:\\2]]'),
-        ('(\[\[.*\]\]).*\]', '\\1'),
-        ('\[(http.*) .*\]', '[[\\1]]'),                  # web link
+        ('\["', '[['),                              # internal link open
+        ('"\]', ']]'),                              # internal link close
+        #('\[:(.*):',  '[[\\1]] '),                 # original internal link expressions
+        #('\[\[(.*)/(.*)\]\]',  '[[\\1:\\2]]'),
+        #('(\[\[.*\]\]).*\]', '\\1'),
+        ('\[(http.*) .*\]', '[[\\1]]'),             # web link
        ('\["/(.*)"\]', '[['+file[-1]+':\\1]]'),
-        ('\{{3}', '<>code>'),                       # code open
-        ('\}{3}', '<>/code>'),                      # code close
+        ('\{{3}', '<>code>'),                        # code open
+        ('\}{3}', '<>/code>'),                       # code close
        ('^\s\s\s\s\*', '        *'),
        ('^\s\s\s\*', '      *'),
        ('^\s\s\*', '    *'),
-        ('^\s\*', '  *'),                           # lists must have not only but 2 whitespaces before *
+        ('^\s\*', '  *'),                           # lists must have 2 whitespaces before the asterisk
        ('^\s\s\s\s1\.', '      -'),
        ('^\s\s1\.', '    -'),
        ('^\s1\.', '  -'),
        ('^\s*=====\s*(.*)\s*=====\s*$', '=-=- \\1 =-=-'),           # heading 5
-        ('^\s*====\s*(.*)\s*====\s*$', '=-=-=- \\1 =-=-=-'),        # heading 4
+        ('^\s*====\s*(.*)\s*====\s*$', '=-=-=- \\1 =-=-=-'),         # heading 4
        ('^\s*===\s*(.*)\s*===\s*$', '=-=-=-=- \\1 =-=-=-=-'),       # heading 3
        ('^\s*==\s*(.*)\s*==\s*$', '=-=-=-=-=- \\1 =-=-=-=-=-'),     # heading 2
-        ('^\s*=\s*(.*)\s=\s*$', '=-=-=-=-=-=- \\1 =-=-=-=-=-=-'),   # heading 1
+        ('^\s*=\s*(.*)\s=\s*$', '=-=-=-=-=-=- \\1 =-=-=-=-=-=-'),    # heading 1
        ('=-', '='),
        ('\|{2}', '|'),                             # table separator
        ('\'{5}(.*)\'{5}', '**//\\1//**'),          # bold and italic
@@ -93,70 +122,105 @@ def print_parameter_error():
    print >> sys.stderr, 'Incorrect parameters! Use --help switch to learn more.'
    sys.exit(1)

-if __name__ == '__main__':
-    if len(sys.argv) > 1:
-        if sys.argv[1] in ('-h', '--help'):
-            print_help()
-        elif len(sys.argv) > 2:
-            moin_pages_dir = sys.argv[1]
-            output_dir = sys.argv[2]
-        else:
-            print_parameter_error()
+def fix_name( filename ):
+    filename = filename.lower()
+    filename = filename.replace('(2d)', '-')          # hyphen
+    filename = filename.replace('(20)', '_')          # space->underscore
+    filename = filename.replace('(2e)', '_')          # decimal point->underscore
+    filename = filename.replace('(29)', '_')          # )->underscore
+    filename = filename.replace('(28)', '_')          # (->underscore
+    filename = filename.replace('.', '_')             # decimal point->underscore
+    filename = filename.replace('(2c20)', '_')        # comma + space->underscore
+    filename = filename.replace('(2028)', '_')        # space + (->underscore
+    filename = filename.replace('(2920)', '_')        # ) + space->underscore
+    filename = filename.replace('(2220)', 'inch_')    # " + space->inch + underscore
+    filename = filename.replace('(3a20)', '_')        # : + space->underscore
+    filename = filename.replace('(202827)', '_')      # space+(+'->underscore
+    filename = filename.replace('(2720)', '_')        # '+ space->underscore
+    filename = filename.replace('(c3bc)', 'ue')       # umlaut
+    filename = filename.replace('(c384)', 'Ae')       # umlaut
+    filename = filename.replace('(c3a4)', 'ae')       # umlaut
+    filename = filename.replace('(c3b6)', 'oe')       # umlaut
+    return filename
+
+#
+# "main" starts here
+#
+if len(sys.argv) > 1:
+    if sys.argv[1] in ('-h', '--help'):
+        print_help()
+    elif len(sys.argv) > 2:
+        moin_pages_dir = sys.argv[1]
+        output_dir = sys.argv[2]
    else:
        print_parameter_error()
+else:
+    print_parameter_error()

-    check_dirs(moin_pages_dir, output_dir)
+check_dirs(moin_pages_dir, output_dir)

-    print 'Input dir is: %s.' % moin_pages_dir
-    print 'Output dir is: %s.' % output_dir
-    print
+print 'Input dir is: %s.' % moin_pages_dir
+print 'Output dir is: %s.' % output_dir

-    pages = get_page_names(moin_pages_dir)
-    for page in pages:
-        curr_rev = get_current_revision(page)
-        if os.path.exists(curr_rev):
-            page_name = basename(page).lower()
-            curr_rev_desc = file(curr_rev, 'r')
-            curr_rev_content = curr_rev_desc.readlines()
-            curr_rev_desc.close()
+pathnames = get_path_names(moin_pages_dir)

-            if 'moineditorbackup' not in page_name: #dont convert backups
-              page_name = page_name.replace('(2d)', '-')
-              page_name = page_name.replace('(c3bc)', 'ue')
-              page_name = page_name.replace('(c384)', 'Ae')
-              page_name = page_name.replace('(c3a4)', 'ae')
-              page_name = page_name.replace('(c3b6)', 'oe')
+for pathname in pathnames:
+    #pdb.set_trace() # start debugging here

-              split = page_name.split('(2f)') # namespaces
-              count = len(split)
-              dateiname = split[-1]
+    curr_rev = get_current_revision( pathname )
+    if not os.path.exists( curr_rev ) : continue

-              dir = output_dir
-              attachment_dir = output_dir + '../media/'
-              if count == 1:
-                dir += 'unsorted'
-                if not isdir (dir):
-                  os.mkdir(dir)
-                attachment_dir += 'unsorted/'
-                if not isdir (attachment_dir):
-                  os.mkdir(attachment_dir)
-              for i in range(0, count - 1):
-                dir += split[i] + '/'
-                if not isdir (dir):
-                  os.mkdir(dir)
-                attachment_dir += split[i] + '/'
-                if not isdir (attachment_dir):
-                  os.mkdir(attachment_dir)
-              if count == 1:
-                str = 'unsorted/' + page_name
-                split = str.split('/')
-                curr_rev_content = convert_page(curr_rev_content, split)
-              else:
-                curr_rev_content = convert_page(curr_rev_content, split)
+    page_name = basename(pathname)
+    if page_name.count('MoinEditorBackup') > 0 : continue # don't convert backups

-              out_file = os.path.join(dir, dateiname + '.txt')
-              out_desc = file(out_file, 'w')
-              out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it])
+    curr_rev_desc = file(curr_rev, 'r')
+    curr_rev_content = curr_rev_desc.readlines()
+    curr_rev_desc.close()

-              out_desc.close()
-              copy_attachments(page, attachment_dir)
+    page_name = fix_name( page_name )
+
+    split = page_name.split('(2f)') # namespaces
+
+    count = len(split)
+
+    dateiname = split[-1]
+
+    dir = output_dir
+    # changed from attachment_dir = output_dir + '../media/':
+    attachment_dir = output_dir + 'media/'
+    if not isdir (attachment_dir):
+      os.mkdir(attachment_dir)
+
+    if count == 1:
+      dir += 'unsorted'
+      if not isdir (dir):
+        os.mkdir(dir)
+
+      attachment_dir += 'unsorted/'
+      if not isdir (attachment_dir):
+        os.mkdir(attachment_dir)
+
+    for i in range(0, count - 1):
+
+      dir += split[i] + '/'
+      if not isdir (dir):
+        os.mkdir(dir)
+
+      attachment_dir += split[i] + '/'
+      if not isdir (attachment_dir):
+        os.mkdir(attachment_dir)
+
+    if count == 1:
+      str = 'unsorted/' + page_name
+      split = str.split('/')
+      curr_rev_content = convert_page(curr_rev_content, split)
+    else:
+      curr_rev_content = convert_page(curr_rev_content, split)
+
+    out_file = os.path.join(dir, dateiname + '.txt')
+    out_desc = file(out_file, 'w')
+    out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it])
+    out_desc.close()
+
+    # pdb.set_trace() # start debugging here
+    copy_attachments(pathname, attachment_dir)