- merge python code v2
http://www.dokuwiki.org/tips:moinmoin2doku?rev=1265494867#another_python_script
This commit is contained in:
parent
5f64d725ee
commit
c0629ccb89
1 changed files with 138 additions and 74 deletions
212
moin2doku.py
212
moin2doku.py
|
@ -1,6 +1,29 @@
|
||||||
#!/usr/bin/python
|
#!/usr/bin/python
|
||||||
import sys, os, os.path
|
#
|
||||||
import re
|
# moin2doku.py
|
||||||
|
#
|
||||||
|
# A script for converting MoinMoin version 1.3+ wiki data to DokuWiki format.
|
||||||
|
# Call with the name of the directory containing the MoinMoin pages and that
|
||||||
|
# of the directory to receive the DokuWiki pages on the command line:
|
||||||
|
#
|
||||||
|
# python moin2doku.py ./moin/data/pages/ ./doku/
|
||||||
|
#
|
||||||
|
# then move the doku pages to e.g. /var/www/MyWikiName/data/pages/,
|
||||||
|
# move the media files to e.g. /var/www/MyWikiName/data/media/,
|
||||||
|
# set ownership: chown -R www-data:www-data /var/www/MyWikiName/data/pages/*
|
||||||
|
# chown -R www-data:www-data /var/www/MyWikiName/data/media/*
|
||||||
|
#
|
||||||
|
# This script doesn't do all the work, and some of the work it does is
|
||||||
|
# wrong. For instance attachment links end up with the trailing "|}}"
|
||||||
|
# on the line following the link. This works, but doesn't look good.
|
||||||
|
# The script interprets a "/" in a pagename as a namespace delimiter and
|
||||||
|
# creates and fills namespace subdirectories accordingly.
|
||||||
|
#
|
||||||
|
# version 0.1 02.2010 Slim Gaillard, based on the "extended python"
|
||||||
|
# convert.py script here:
|
||||||
|
# http://www.dokuwiki.org/tips:moinmoin2doku
|
||||||
|
#
|
||||||
|
import sys, os, os.path, re, pdb
|
||||||
from os import listdir
|
from os import listdir
|
||||||
from os.path import isdir, basename
|
from os.path import isdir, basename
|
||||||
|
|
||||||
|
@ -13,14 +36,16 @@ def check_dirs(moin_pages_dir, output_dir):
|
||||||
print >> sys.stderr, "Output directory doesn't exist!"
|
print >> sys.stderr, "Output directory doesn't exist!"
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
def get_page_names(moin_pages_dir):
|
def get_path_names(moin_pages_dir):
|
||||||
items = listdir(moin_pages_dir)
|
items = listdir(moin_pages_dir)
|
||||||
pages = []
|
pathnames = []
|
||||||
|
|
||||||
for item in items:
|
for item in items:
|
||||||
item = os.path.join(moin_pages_dir, item)
|
item = os.path.join(moin_pages_dir, item)
|
||||||
if isdir(item):
|
if isdir(item):
|
||||||
pages.append(item)
|
pathnames.append(item)
|
||||||
return pages
|
|
||||||
|
return pathnames
|
||||||
|
|
||||||
def get_current_revision(page_dir):
|
def get_current_revision(page_dir):
|
||||||
rev_dir = os.path.join(page_dir, 'revisions')
|
rev_dir = os.path.join(page_dir, 'revisions')
|
||||||
|
@ -34,8 +59,10 @@ def copy_attachments(page_dir, attachment_dir):
|
||||||
dir = os.path.join(page_dir,'attachments')
|
dir = os.path.join(page_dir,'attachments')
|
||||||
if isdir(dir):
|
if isdir(dir):
|
||||||
attachments = listdir(dir)
|
attachments = listdir(dir)
|
||||||
|
#pdb.set_trace()
|
||||||
for attachment in attachments:
|
for attachment in attachments:
|
||||||
os.system ('cp "' + dir +'/' + attachment + '" "' + attachment_dir +'"')
|
cmd_string = 'cp "' + dir +'/' + attachment + '" "' + attachment_dir + attachment.lower() + '"'
|
||||||
|
os.system ( cmd_string )
|
||||||
|
|
||||||
def convert_page(page, file):
|
def convert_page(page, file):
|
||||||
namespace = ':'
|
namespace = ':'
|
||||||
|
@ -43,30 +70,32 @@ def convert_page(page, file):
|
||||||
namespace += file[i] + ':'
|
namespace += file[i] + ':'
|
||||||
|
|
||||||
regexp = (
|
regexp = (
|
||||||
('\[\[TableOfContents.*\]\]', ''), # remove
|
('\[\[TableOfContents.*\]\]', ''), # remove
|
||||||
('\[\[BR\]\]$', ''), # newline at end of line - remove
|
('\[\[BR\]\]$', ''), # newline at end of line - remove
|
||||||
('\[\[BR\]\]', '\n'), # newline
|
('\[\[BR\]\]', '\n'), # newline
|
||||||
('#pragma section-numbers off', ''), # remove
|
('#pragma section-numbers off', ''), # remove
|
||||||
('^##.*?\\n', ''), # remove
|
('^##.*?\\n', ''), # remove
|
||||||
('\[:(.*):', '[[\\1]] '), # internal link
|
('\["', '[['), # internal link open
|
||||||
('\[\[(.*)/(.*)\]\]', '[[\\1:\\2]]'),
|
('"\]', ']]'), # internal link close
|
||||||
('(\[\[.*\]\]).*\]', '\\1'),
|
#('\[:(.*):', '[[\\1]] '), # original internal link expressions
|
||||||
('\[(http.*) .*\]', '[[\\1]]'), # web link
|
#('\[\[(.*)/(.*)\]\]', '[[\\1:\\2]]'),
|
||||||
|
#('(\[\[.*\]\]).*\]', '\\1'),
|
||||||
|
('\[(http.*) .*\]', '[[\\1]]'), # web link
|
||||||
('\["/(.*)"\]', '[['+file[-1]+':\\1]]'),
|
('\["/(.*)"\]', '[['+file[-1]+':\\1]]'),
|
||||||
('\{{3}', '<>code>'), # code open
|
('\{{3}', '<>code>'), # code open
|
||||||
('\}{3}', '<>/code>'), # code close
|
('\}{3}', '<>/code>'), # code close
|
||||||
('^\s\s\s\s\*', ' *'),
|
('^\s\s\s\s\*', ' *'),
|
||||||
('^\s\s\s\*', ' *'),
|
('^\s\s\s\*', ' *'),
|
||||||
('^\s\s\*', ' *'),
|
('^\s\s\*', ' *'),
|
||||||
('^\s\*', ' *'), # lists must have not only but 2 whitespaces before *
|
('^\s\*', ' *'), # lists must have 2 whitespaces before the asterisk
|
||||||
('^\s\s\s\s1\.', ' -'),
|
('^\s\s\s\s1\.', ' -'),
|
||||||
('^\s\s1\.', ' -'),
|
('^\s\s1\.', ' -'),
|
||||||
('^\s1\.', ' -'),
|
('^\s1\.', ' -'),
|
||||||
('^\s*=====\s*(.*)\s*=====\s*$', '=-=- \\1 =-=-'), # heading 5
|
('^\s*=====\s*(.*)\s*=====\s*$', '=-=- \\1 =-=-'), # heading 5
|
||||||
('^\s*====\s*(.*)\s*====\s*$', '=-=-=- \\1 =-=-=-'), # heading 4
|
('^\s*====\s*(.*)\s*====\s*$', '=-=-=- \\1 =-=-=-'), # heading 4
|
||||||
('^\s*===\s*(.*)\s*===\s*$', '=-=-=-=- \\1 =-=-=-=-'), # heading 3
|
('^\s*===\s*(.*)\s*===\s*$', '=-=-=-=- \\1 =-=-=-=-'), # heading 3
|
||||||
('^\s*==\s*(.*)\s*==\s*$', '=-=-=-=-=- \\1 =-=-=-=-=-'), # heading 2
|
('^\s*==\s*(.*)\s*==\s*$', '=-=-=-=-=- \\1 =-=-=-=-=-'), # heading 2
|
||||||
('^\s*=\s*(.*)\s=\s*$', '=-=-=-=-=-=- \\1 =-=-=-=-=-=-'), # heading 1
|
('^\s*=\s*(.*)\s=\s*$', '=-=-=-=-=-=- \\1 =-=-=-=-=-=-'), # heading 1
|
||||||
('=-', '='),
|
('=-', '='),
|
||||||
('\|{2}', '|'), # table separator
|
('\|{2}', '|'), # table separator
|
||||||
('\'{5}(.*)\'{5}', '**//\\1//**'), # bold and italic
|
('\'{5}(.*)\'{5}', '**//\\1//**'), # bold and italic
|
||||||
|
@ -93,70 +122,105 @@ def print_parameter_error():
|
||||||
print >> sys.stderr, 'Incorrect parameters! Use --help switch to learn more.'
|
print >> sys.stderr, 'Incorrect parameters! Use --help switch to learn more.'
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
if __name__ == '__main__':
|
def fix_name( filename ):
|
||||||
if len(sys.argv) > 1:
|
filename = filename.lower()
|
||||||
if sys.argv[1] in ('-h', '--help'):
|
filename = filename.replace('(2d)', '-') # hyphen
|
||||||
print_help()
|
filename = filename.replace('(20)', '_') # space->underscore
|
||||||
elif len(sys.argv) > 2:
|
filename = filename.replace('(2e)', '_') # decimal point->underscore
|
||||||
moin_pages_dir = sys.argv[1]
|
filename = filename.replace('(29)', '_') # )->underscore
|
||||||
output_dir = sys.argv[2]
|
filename = filename.replace('(28)', '_') # (->underscore
|
||||||
else:
|
filename = filename.replace('.', '_') # decimal point->underscore
|
||||||
print_parameter_error()
|
filename = filename.replace('(2c20)', '_') # comma + space->underscore
|
||||||
|
filename = filename.replace('(2028)', '_') # space + (->underscore
|
||||||
|
filename = filename.replace('(2920)', '_') # ) + space->underscore
|
||||||
|
filename = filename.replace('(2220)', 'inch_') # " + space->inch + underscore
|
||||||
|
filename = filename.replace('(3a20)', '_') # : + space->underscore
|
||||||
|
filename = filename.replace('(202827)', '_') # space+(+'->underscore
|
||||||
|
filename = filename.replace('(2720)', '_') # '+ space->underscore
|
||||||
|
filename = filename.replace('(c3bc)', 'ue') # umlaut
|
||||||
|
filename = filename.replace('(c384)', 'Ae') # umlaut
|
||||||
|
filename = filename.replace('(c3a4)', 'ae') # umlaut
|
||||||
|
filename = filename.replace('(c3b6)', 'oe') # umlaut
|
||||||
|
return filename
|
||||||
|
|
||||||
|
#
|
||||||
|
# "main" starts here
|
||||||
|
#
|
||||||
|
if len(sys.argv) > 1:
|
||||||
|
if sys.argv[1] in ('-h', '--help'):
|
||||||
|
print_help()
|
||||||
|
elif len(sys.argv) > 2:
|
||||||
|
moin_pages_dir = sys.argv[1]
|
||||||
|
output_dir = sys.argv[2]
|
||||||
else:
|
else:
|
||||||
print_parameter_error()
|
print_parameter_error()
|
||||||
|
else:
|
||||||
|
print_parameter_error()
|
||||||
|
|
||||||
check_dirs(moin_pages_dir, output_dir)
|
check_dirs(moin_pages_dir, output_dir)
|
||||||
|
|
||||||
print 'Input dir is: %s.' % moin_pages_dir
|
print 'Input dir is: %s.' % moin_pages_dir
|
||||||
print 'Output dir is: %s.' % output_dir
|
print 'Output dir is: %s.' % output_dir
|
||||||
print
|
|
||||||
|
|
||||||
pages = get_page_names(moin_pages_dir)
|
pathnames = get_path_names(moin_pages_dir)
|
||||||
for page in pages:
|
|
||||||
curr_rev = get_current_revision(page)
|
|
||||||
if os.path.exists(curr_rev):
|
|
||||||
page_name = basename(page).lower()
|
|
||||||
curr_rev_desc = file(curr_rev, 'r')
|
|
||||||
curr_rev_content = curr_rev_desc.readlines()
|
|
||||||
curr_rev_desc.close()
|
|
||||||
|
|
||||||
if 'moineditorbackup' not in page_name: #dont convert backups
|
for pathname in pathnames:
|
||||||
page_name = page_name.replace('(2d)', '-')
|
#pdb.set_trace() # start debugging here
|
||||||
page_name = page_name.replace('(c3bc)', 'ue')
|
|
||||||
page_name = page_name.replace('(c384)', 'Ae')
|
|
||||||
page_name = page_name.replace('(c3a4)', 'ae')
|
|
||||||
page_name = page_name.replace('(c3b6)', 'oe')
|
|
||||||
|
|
||||||
split = page_name.split('(2f)') # namespaces
|
curr_rev = get_current_revision( pathname )
|
||||||
count = len(split)
|
if not os.path.exists( curr_rev ) : continue
|
||||||
dateiname = split[-1]
|
|
||||||
|
|
||||||
dir = output_dir
|
page_name = basename(pathname)
|
||||||
attachment_dir = output_dir + '../media/'
|
if page_name.count('MoinEditorBackup') > 0 : continue # don't convert backups
|
||||||
if count == 1:
|
|
||||||
dir += 'unsorted'
|
|
||||||
if not isdir (dir):
|
|
||||||
os.mkdir(dir)
|
|
||||||
attachment_dir += 'unsorted/'
|
|
||||||
if not isdir (attachment_dir):
|
|
||||||
os.mkdir(attachment_dir)
|
|
||||||
for i in range(0, count - 1):
|
|
||||||
dir += split[i] + '/'
|
|
||||||
if not isdir (dir):
|
|
||||||
os.mkdir(dir)
|
|
||||||
attachment_dir += split[i] + '/'
|
|
||||||
if not isdir (attachment_dir):
|
|
||||||
os.mkdir(attachment_dir)
|
|
||||||
if count == 1:
|
|
||||||
str = 'unsorted/' + page_name
|
|
||||||
split = str.split('/')
|
|
||||||
curr_rev_content = convert_page(curr_rev_content, split)
|
|
||||||
else:
|
|
||||||
curr_rev_content = convert_page(curr_rev_content, split)
|
|
||||||
|
|
||||||
out_file = os.path.join(dir, dateiname + '.txt')
|
curr_rev_desc = file(curr_rev, 'r')
|
||||||
out_desc = file(out_file, 'w')
|
curr_rev_content = curr_rev_desc.readlines()
|
||||||
out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it])
|
curr_rev_desc.close()
|
||||||
|
|
||||||
out_desc.close()
|
page_name = fix_name( page_name )
|
||||||
copy_attachments(page, attachment_dir)
|
|
||||||
|
split = page_name.split('(2f)') # namespaces
|
||||||
|
|
||||||
|
count = len(split)
|
||||||
|
|
||||||
|
dateiname = split[-1]
|
||||||
|
|
||||||
|
dir = output_dir
|
||||||
|
# changed from attachment_dir = output_dir + '../media/':
|
||||||
|
attachment_dir = output_dir + 'media/'
|
||||||
|
if not isdir (attachment_dir):
|
||||||
|
os.mkdir(attachment_dir)
|
||||||
|
|
||||||
|
if count == 1:
|
||||||
|
dir += 'unsorted'
|
||||||
|
if not isdir (dir):
|
||||||
|
os.mkdir(dir)
|
||||||
|
|
||||||
|
attachment_dir += 'unsorted/'
|
||||||
|
if not isdir (attachment_dir):
|
||||||
|
os.mkdir(attachment_dir)
|
||||||
|
|
||||||
|
for i in range(0, count - 1):
|
||||||
|
|
||||||
|
dir += split[i] + '/'
|
||||||
|
if not isdir (dir):
|
||||||
|
os.mkdir(dir)
|
||||||
|
|
||||||
|
attachment_dir += split[i] + '/'
|
||||||
|
if not isdir (attachment_dir):
|
||||||
|
os.mkdir(attachment_dir)
|
||||||
|
|
||||||
|
if count == 1:
|
||||||
|
str = 'unsorted/' + page_name
|
||||||
|
split = str.split('/')
|
||||||
|
curr_rev_content = convert_page(curr_rev_content, split)
|
||||||
|
else:
|
||||||
|
curr_rev_content = convert_page(curr_rev_content, split)
|
||||||
|
|
||||||
|
out_file = os.path.join(dir, dateiname + '.txt')
|
||||||
|
out_desc = file(out_file, 'w')
|
||||||
|
out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it])
|
||||||
|
out_desc.close()
|
||||||
|
|
||||||
|
# pdb.set_trace() # start debugging here
|
||||||
|
copy_attachments(pathname, attachment_dir)
|
||||||
|
|
Loading…
Reference in a new issue