From c0629ccb89865e1fe8fc2707c5b9a43100a8d8f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Elan=20Ruusam=C3=A4e?= Date: Sun, 6 Feb 2011 19:05:22 +0200 Subject: [PATCH] - merge python code v2 http://www.dokuwiki.org/tips:moinmoin2doku?rev=1265494867#another_python_script --- moin2doku.py | 212 +++++++++++++++++++++++++++++++++------------------ 1 file changed, 138 insertions(+), 74 deletions(-) diff --git a/moin2doku.py b/moin2doku.py index db10f20..b26f6d6 100644 --- a/moin2doku.py +++ b/moin2doku.py @@ -1,6 +1,29 @@ #!/usr/bin/python -import sys, os, os.path -import re +# +# moin2doku.py +# +# A script for converting MoinMoin version 1.3+ wiki data to DokuWiki format. +# Call with the name of the directory containing the MoinMoin pages and that +# of the directory to receive the DokuWiki pages on the command line: +# +# python moin2doku.py ./moin/data/pages/ ./doku/ +# +# then move the doku pages to e.g. /var/www/MyWikiName/data/pages/, +# move the media files to e.g. /var/www/MyWikiName/data/media/, +# set ownership: chown -R www-data:www-data /var/www/MyWikiName/data/pages/* +# chown -R www-data:www-data /var/www/MyWikiName/data/media/* +# +# This script doesn't do all the work, and some of the work it does is +# wrong. For instance attachment links end up with the trailing "|}}" +# on the line following the link. This works, but doesn't look good. +# The script interprets a "/" in a pagename as a namespace delimiter and +# creates and fills namespace subdirectories accordingly. +# +# version 0.1 02.2010 Slim Gaillard, based on the "extended python" +# convert.py script here: +# http://www.dokuwiki.org/tips:moinmoin2doku +# +import sys, os, os.path, re, pdb from os import listdir from os.path import isdir, basename @@ -13,14 +36,16 @@ def check_dirs(moin_pages_dir, output_dir): print >> sys.stderr, "Output directory doesn't exist!" sys.exit(1) -def get_page_names(moin_pages_dir): +def get_path_names(moin_pages_dir): items = listdir(moin_pages_dir) - pages = [] + pathnames = [] + for item in items: item = os.path.join(moin_pages_dir, item) if isdir(item): - pages.append(item) - return pages + pathnames.append(item) + + return pathnames def get_current_revision(page_dir): rev_dir = os.path.join(page_dir, 'revisions') @@ -34,8 +59,10 @@ def copy_attachments(page_dir, attachment_dir): dir = os.path.join(page_dir,'attachments') if isdir(dir): attachments = listdir(dir) + #pdb.set_trace() for attachment in attachments: - os.system ('cp "' + dir +'/' + attachment + '" "' + attachment_dir +'"') + cmd_string = 'cp "' + dir +'/' + attachment + '" "' + attachment_dir + attachment.lower() + '"' + os.system ( cmd_string ) def convert_page(page, file): namespace = ':' @@ -43,30 +70,32 @@ def convert_page(page, file): namespace += file[i] + ':' regexp = ( - ('\[\[TableOfContents.*\]\]', ''), # remove + ('\[\[TableOfContents.*\]\]', ''), # remove ('\[\[BR\]\]$', ''), # newline at end of line - remove ('\[\[BR\]\]', '\n'), # newline ('#pragma section-numbers off', ''), # remove ('^##.*?\\n', ''), # remove - ('\[:(.*):', '[[\\1]] '), # internal link - ('\[\[(.*)/(.*)\]\]', '[[\\1:\\2]]'), - ('(\[\[.*\]\]).*\]', '\\1'), - ('\[(http.*) .*\]', '[[\\1]]'), # web link + ('\["', '[['), # internal link open + ('"\]', ']]'), # internal link close + #('\[:(.*):', '[[\\1]] '), # original internal link expressions + #('\[\[(.*)/(.*)\]\]', '[[\\1:\\2]]'), + #('(\[\[.*\]\]).*\]', '\\1'), + ('\[(http.*) .*\]', '[[\\1]]'), # web link ('\["/(.*)"\]', '[['+file[-1]+':\\1]]'), - ('\{{3}', '<>code>'), # code open - ('\}{3}', '<>/code>'), # code close + ('\{{3}', '<>code>'), # code open + ('\}{3}', '<>/code>'), # code close ('^\s\s\s\s\*', ' *'), ('^\s\s\s\*', ' *'), ('^\s\s\*', ' *'), - ('^\s\*', ' *'), # lists must have not only but 2 whitespaces before * + ('^\s\*', ' *'), # lists must have 2 whitespaces before the asterisk ('^\s\s\s\s1\.', ' -'), ('^\s\s1\.', ' -'), ('^\s1\.', ' -'), ('^\s*=====\s*(.*)\s*=====\s*$', '=-=- \\1 =-=-'), # heading 5 - ('^\s*====\s*(.*)\s*====\s*$', '=-=-=- \\1 =-=-=-'), # heading 4 + ('^\s*====\s*(.*)\s*====\s*$', '=-=-=- \\1 =-=-=-'), # heading 4 ('^\s*===\s*(.*)\s*===\s*$', '=-=-=-=- \\1 =-=-=-=-'), # heading 3 ('^\s*==\s*(.*)\s*==\s*$', '=-=-=-=-=- \\1 =-=-=-=-=-'), # heading 2 - ('^\s*=\s*(.*)\s=\s*$', '=-=-=-=-=-=- \\1 =-=-=-=-=-=-'), # heading 1 + ('^\s*=\s*(.*)\s=\s*$', '=-=-=-=-=-=- \\1 =-=-=-=-=-=-'), # heading 1 ('=-', '='), ('\|{2}', '|'), # table separator ('\'{5}(.*)\'{5}', '**//\\1//**'), # bold and italic @@ -93,70 +122,105 @@ def print_parameter_error(): print >> sys.stderr, 'Incorrect parameters! Use --help switch to learn more.' sys.exit(1) -if __name__ == '__main__': - if len(sys.argv) > 1: - if sys.argv[1] in ('-h', '--help'): - print_help() - elif len(sys.argv) > 2: - moin_pages_dir = sys.argv[1] - output_dir = sys.argv[2] - else: - print_parameter_error() +def fix_name( filename ): + filename = filename.lower() + filename = filename.replace('(2d)', '-') # hyphen + filename = filename.replace('(20)', '_') # space->underscore + filename = filename.replace('(2e)', '_') # decimal point->underscore + filename = filename.replace('(29)', '_') # )->underscore + filename = filename.replace('(28)', '_') # (->underscore + filename = filename.replace('.', '_') # decimal point->underscore + filename = filename.replace('(2c20)', '_') # comma + space->underscore + filename = filename.replace('(2028)', '_') # space + (->underscore + filename = filename.replace('(2920)', '_') # ) + space->underscore + filename = filename.replace('(2220)', 'inch_') # " + space->inch + underscore + filename = filename.replace('(3a20)', '_') # : + space->underscore + filename = filename.replace('(202827)', '_') # space+(+'->underscore + filename = filename.replace('(2720)', '_') # '+ space->underscore + filename = filename.replace('(c3bc)', 'ue') # umlaut + filename = filename.replace('(c384)', 'Ae') # umlaut + filename = filename.replace('(c3a4)', 'ae') # umlaut + filename = filename.replace('(c3b6)', 'oe') # umlaut + return filename + +# +# "main" starts here +# +if len(sys.argv) > 1: + if sys.argv[1] in ('-h', '--help'): + print_help() + elif len(sys.argv) > 2: + moin_pages_dir = sys.argv[1] + output_dir = sys.argv[2] else: print_parameter_error() +else: + print_parameter_error() - check_dirs(moin_pages_dir, output_dir) +check_dirs(moin_pages_dir, output_dir) - print 'Input dir is: %s.' % moin_pages_dir - print 'Output dir is: %s.' % output_dir - print +print 'Input dir is: %s.' % moin_pages_dir +print 'Output dir is: %s.' % output_dir - pages = get_page_names(moin_pages_dir) - for page in pages: - curr_rev = get_current_revision(page) - if os.path.exists(curr_rev): - page_name = basename(page).lower() - curr_rev_desc = file(curr_rev, 'r') - curr_rev_content = curr_rev_desc.readlines() - curr_rev_desc.close() +pathnames = get_path_names(moin_pages_dir) - if 'moineditorbackup' not in page_name: #dont convert backups - page_name = page_name.replace('(2d)', '-') - page_name = page_name.replace('(c3bc)', 'ue') - page_name = page_name.replace('(c384)', 'Ae') - page_name = page_name.replace('(c3a4)', 'ae') - page_name = page_name.replace('(c3b6)', 'oe') +for pathname in pathnames: + #pdb.set_trace() # start debugging here - split = page_name.split('(2f)') # namespaces - count = len(split) - dateiname = split[-1] + curr_rev = get_current_revision( pathname ) + if not os.path.exists( curr_rev ) : continue - dir = output_dir - attachment_dir = output_dir + '../media/' - if count == 1: - dir += 'unsorted' - if not isdir (dir): - os.mkdir(dir) - attachment_dir += 'unsorted/' - if not isdir (attachment_dir): - os.mkdir(attachment_dir) - for i in range(0, count - 1): - dir += split[i] + '/' - if not isdir (dir): - os.mkdir(dir) - attachment_dir += split[i] + '/' - if not isdir (attachment_dir): - os.mkdir(attachment_dir) - if count == 1: - str = 'unsorted/' + page_name - split = str.split('/') - curr_rev_content = convert_page(curr_rev_content, split) - else: - curr_rev_content = convert_page(curr_rev_content, split) + page_name = basename(pathname) + if page_name.count('MoinEditorBackup') > 0 : continue # don't convert backups - out_file = os.path.join(dir, dateiname + '.txt') - out_desc = file(out_file, 'w') - out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it]) + curr_rev_desc = file(curr_rev, 'r') + curr_rev_content = curr_rev_desc.readlines() + curr_rev_desc.close() - out_desc.close() - copy_attachments(page, attachment_dir) + page_name = fix_name( page_name ) + + split = page_name.split('(2f)') # namespaces + + count = len(split) + + dateiname = split[-1] + + dir = output_dir + # changed from attachment_dir = output_dir + '../media/': + attachment_dir = output_dir + 'media/' + if not isdir (attachment_dir): + os.mkdir(attachment_dir) + + if count == 1: + dir += 'unsorted' + if not isdir (dir): + os.mkdir(dir) + + attachment_dir += 'unsorted/' + if not isdir (attachment_dir): + os.mkdir(attachment_dir) + + for i in range(0, count - 1): + + dir += split[i] + '/' + if not isdir (dir): + os.mkdir(dir) + + attachment_dir += split[i] + '/' + if not isdir (attachment_dir): + os.mkdir(attachment_dir) + + if count == 1: + str = 'unsorted/' + page_name + split = str.split('/') + curr_rev_content = convert_page(curr_rev_content, split) + else: + curr_rev_content = convert_page(curr_rev_content, split) + + out_file = os.path.join(dir, dateiname + '.txt') + out_desc = file(out_file, 'w') + out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it]) + out_desc.close() + + # pdb.set_trace() # start debugging here + copy_attachments(pathname, attachment_dir)