This commit is contained in:
Elan Ruusamäe 2011-02-07 00:14:47 +02:00
parent 6d527beb68
commit acf08e8fe8

View file

@ -33,27 +33,27 @@ from os import listdir
from os.path import isdir, basename from os.path import isdir, basename
def check_dirs(moin_pages_dir, output_dir):
    """Abort the program unless both working directories exist.

    Args:
        moin_pages_dir: path that must be an existing directory holding
            the MoinMoin pages.
        output_dir: path that must be an existing directory for the
            converted output.

    Returns:
        None when both paths are directories.

    Exits:
        Calls ``sys.exit(1)`` after writing a message to stderr for the
        first path that is not a directory (the pages directory is
        checked first).
    """
    required = (
        (moin_pages_dir, "MoinMoin pages directory doesn't exist!"),
        (output_dir, "Output directory doesn't exist!"),
    )
    for path, message in required:
        if not isdir(path):
            # sys.stderr.write behaves the same on Python 2 and 3, unlike
            # the Python-2-only "print >> sys.stderr" statement.
            sys.stderr.write(message + "\n")
            sys.exit(1)
def get_path_names(moin_pages_dir):
    """Return the immediate sub-directories of *moin_pages_dir*.

    Args:
        moin_pages_dir: directory whose entries are inspected.

    Returns:
        A list of paths (each entry joined onto *moin_pages_dir*) for the
        entries that are directories. The list is sorted so that repeated
        runs process pages in the same order; ``listdir`` itself returns
        entries in arbitrary order.
    """
    candidates = (
        os.path.join(moin_pages_dir, entry)
        for entry in listdir(moin_pages_dir)
    )
    return sorted(path for path in candidates if isdir(path))
def readfile(filename):
    """Return the content of *filename* as a list of lines.

    Args:
        filename: path of the text file to read.

    Returns:
        The list produced by ``readlines()``; each line keeps its
        trailing newline. An empty file yields an empty list.
    """
    # "with" closes the handle deterministically; open() replaces the
    # Python-2-only file() builtin.
    with open(filename, 'r') as handle:
        return handle.readlines()
def writefile(filename, content, overwrite=False): def writefile(filename, content, overwrite=False):
dir = os.path.split(filename)[0] dir = os.path.split(filename)[0]
@ -109,184 +109,184 @@ def copy_attachments(page_dir, attachment_dir):
os.system(cmd_string) os.system(cmd_string)
def convert_markup(content, filename):
    """Convert MoinMoin page markup to DokuWiki markup, line by line.

    Args:
        content: list of page lines. The list is rewritten in place and
            the same list object is returned.
        filename: sequence of name components for the page; every
            component except the last is used as a namespace part for
            attachment links and the last one is used as the page name
            for ``["/SubPage"]`` links. (convertfile passes the page name
            split on the '(2f)' separator.)

    Returns:
        *content*, with every line passed through the substitution rules
        below. Rules are applied in order, so later rules see the output
        of earlier ones.
    """
    # ':ns1:ns2:' style prefix built from all components but the last.
    namespace = ':' + ''.join(part + ':' for part in filename[:-1])

    # http://www.pld-linux.org/SyntaxReference
    rules = (
        (r'\[\[TableOfContents.*\]\]', ''),     # remove
        (r'\[\[BR\]\]$', ''),                   # newline at end of line - remove
        (r'\[\[BR\]\]', '\n'),                  # newline
        (r'#pragma section-numbers off', ''),   # remove
        (r'^##.*?\n', ''),                      # comments: remove
        (r'^#(pragma|format|redirect|refresh|language|acl)(.*?)\n', ''),  # remove all
        # deprecated; the note must be closed with </note>, not a second
        # opening <note> tag.
        (r'^#deprecated(.*)\n', '<note warning>This page is deprecated</note>\n'),

        # Other elements
        # break: DokuWiki's forced line break is two backslashes followed
        # by whitespace. re.sub collapses each escaped backslash pair in
        # the template, so four are needed here to emit two.
        (r'(<<BR>>)|(\[\[BR]])', r'\\\\ '),

        # horizontal line
        (r'^\s*-{4,}\s*$', '----\n'),

        # Macros and other unsupported constructs - simply remove
        (r'<<.+?>>', ''),
        (r'\[\[Anchor\(\w+\)\]\]', ''),
        (r'\[\[(PageCount|RandomPage)\]\]', ''),

        # internal links
        (r'\[:(.+)\]', r'[[\1]]'),

        # TODO: handle more depths
        # NOTE(review): the leading "B" in the replacement looks like a
        # leftover debugging marker; kept as-is - confirm before removing.
        (r'\[\[(.*)/(.*)\]\]', r'B[[\1:\2]]'),

        # wiki:xxx
        (r'\[wiki:([^\s]+)\s+(.+)]', r'[[\1|\2]]'),
        (r'wiki:([^\s]+)\s+(.+)', r'[[\1|\2]]'),
        (r'wiki:([^\s]+)', r'[[\1]]'),
        (r'(\[\[.+\]\]).*\]', r'\1'),

        # web link without title
        (r'\[((?:http|https|file)[^\s]+)\]', r'[[\1]]'),
        # web link with title
        (r'\[((?:http|https|file)[^\s]+)\s+(.+?)\]', r'[[\1|\2]]'),

        # sub-page link relative to the current page
        (r'\["/(.*)"\]', '[[' + filename[-1] + r':\1]]'),

        # code blocks
        (r'\{{3}#!(python|php)', r'<code \1>'),  # open and language
        (r'\{{3}', '<code>'),                    # code open
        (r'\}{3}', '</code>'),                   # close

        # Lists: DokuWiki needs at least two spaces before the marker and
        # deeper indentation for nested items, so each leading whitespace
        # character of the MoinMoin item becomes two spaces.
        (r'^\s\s\s\s\*', '        *'),
        (r'^\s\s\s\*', '      *'),
        (r'^\s\s\*', '    *'),
        (r'^\s\*', '  *'),
        (r'^\s\s\s\s1\.', '        -'),
        (r'^\s\s1\.', '    -'),
        (r'^\s1\.', '  -'),

        # Headings: MoinMoin's smallest heading has the most '=' signs,
        # DokuWiki's the fewest. '=-' is a temporary placeholder so that
        # an already converted heading is not matched again by a later
        # heading rule; it is collapsed to '=' right after.
        (r'^\s*=====\s*(.*)\s*=====\s*$', r'=-=- \1 =-=-'),                  # heading 5
        (r'^\s*====\s*(.*)\s*====\s*$', r'=-=-=- \1 =-=-=-'),                # heading 4
        (r'^\s*===\s*(.*)\s*===\s*$', r'=-=-=-=- \1 =-=-=-=-'),              # heading 3
        (r'^\s*==\s*(.*)\s*==\s*$', r'=-=-=-=-=- \1 =-=-=-=-=-'),            # heading 2
        (r'^\s*=\s*(.*)\s=\s*$', r'=-=-=-=-=-=- \1 =-=-=-=-=-=-'),           # heading 1
        (r'=-', '='),

        (r'\|{2}', '|'),                        # table separator

        (r"'{5}(.*)'{5}", r'**//\1//**'),       # bold and italic
        (r"'{3}(.*)'{3}", r'**\1**'),           # bold
        (r"'{2}(.*)'{2}", r'//\1//'),           # italic
        (r'`(.*?)`', r"''\1''"),                # monospaced

        # CamelCase -> internal link.
        # NOTE(review): the original pattern was a non-raw string, so its
        # "\b" was a literal backspace character rather than a word
        # boundary and the rule never matched ordinary text. That
        # behaviour is preserved here on purpose: enabling it would also
        # rewrite CamelCase words inside links and attachment names
        # produced by the rules above. Revisit together with a stricter
        # look-behind.
        ('(?<!\\[)(\x08[A-Z]+[a-z]+[A-Z][A-Za-z]*\x08)', r'[[\1]]'),

        (r'\[\[Date\(([\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}Z)\)\]\]', r'\1'),  # Date value
        (r'attachment:(.*)', '{{' + namespace + r'\1|}}'),
    )

    # Compile once per call instead of looking each pattern up per line.
    compiled = [(re.compile(pattern), repl) for pattern, repl in rules]

    for index, line in enumerate(content):
        for pattern, repl in compiled:
            line = pattern.sub(repl, line)
        content[index] = line
    return content
def print_help():
    """Write the usage text to stdout and terminate with exit status 0."""
    usage_lines = (
        "Usage: moinconv.py <moinmoin pages directory> <output directory>",
        "Convert MoinMoin pages to DokuWiki.",
        "Options:",
        "-o - overwrite output files",
        "-f FILE - convert single file",
    )
    # One write keeps the output identical on Python 2 and 3 (the print
    # statement is Python-2-only).
    sys.stdout.write("\n".join(usage_lines) + "\n")
    sys.exit(0)
def unquote(filename):
    """Translate a MoinMoin-quoted page directory name to a DokuWiki name.

    The name is lowercased, then the quoted sequences listed below are
    replaced in order. Sequences that are not listed are left untouched;
    in particular '(2f)' stays in place, and convertfile later splits the
    name on it.

    Args:
        filename: quoted page name, e.g. 'Foo(2d)Bar'.

    Returns:
        The lowercased, partially unquoted name, e.g. 'foo-bar'.
    """
    # Order matters: entries are applied top to bottom.
    replacements = (
        ('(2d)', '-'),          # hyphen
        ('(20)', '_'),          # space->underscore
        ('(2e)', '_'),          # decimal point->underscore
        ('(29)', '_'),          # )->underscore
        ('(28)', '_'),          # (->underscore
        ('.', '_'),             # decimal point->underscore
        ('(2c20)', '_'),        # comma + space->underscore
        ('(2028)', '_'),        # space + (->underscore
        ('(2920)', '_'),        # ) + space->underscore
        ('(2220)', 'inch_'),    # " + space->inch + underscore
        ('(3a20)', '_'),        # : + space->underscore
        ('(202827)', '_'),      # space+(+'->underscore
        ('(2720)', '_'),        # '+ space->underscore
        ('(c3bc)', 'ue'),       # umlaut
        # Capital A-umlaut: emitted in lowercase so the result stays fully
        # lowercase, matching the lower() call above (the original 'Ae'
        # reintroduced an uppercase letter).
        ('(c384)', 'ae'),       # umlaut
        ('(c3a4)', 'ae'),       # umlaut
        ('(c3b6)', 'oe'),       # umlaut
    )
    filename = filename.lower()
    for quoted, plain in replacements:
        filename = filename.replace(quoted, plain)
    return filename
def convertfile(pathname, overwrite = False):
    """Convert one MoinMoin page directory into a DokuWiki text file.

    Args:
        pathname: path of the MoinMoin page directory.
        overwrite: forwarded to writefile().

    Returns:
        1 when the page was converted, None when it was skipped (no
        current revision, revision file missing, or an editor backup
        page).

    Reads the module-level ``output_dir`` as the root for both the page
    files and the 'media' attachment tree. Progress and skip reasons are
    printed to stdout.
    """
    # print("..." % x) with a single pre-formatted string behaves the same
    # as a print statement on Python 2 and as the function on Python 3.
    print("-> %s" % pathname)

    curr_rev = get_current_revision(pathname)
    if curr_rev is None:
        print("SKIP %s: no current revision" % pathname)
        return None

    if not os.path.exists(curr_rev):
        print("SKIP %s: filename missing" % curr_rev)
        return None

    page_name = basename(pathname)
    if 'MoinEditorBackup' in page_name:
        print("SKIP %s: skip backups" % pathname)
        return None

    content = readfile(curr_rev)

    page_name = unquote(page_name)
    print("dokuname: %s" % page_name)

    # split by namespace separator
    ns = page_name.split('(2f)')
    page_id = ns[-1]

    # root namespace files go to "unsorted"
    if len(ns) == 1:
        ns.insert(0, 'unsorted')

    # Every component except the page id is a namespace directory, used
    # for both the page file and its attachments.
    namespace_parts = ns[:-1]
    page_dir = os.path.join(output_dir, *namespace_parts)
    attachment_dir = os.path.join(output_dir, 'media', *namespace_parts)

    content = convert_markup(content, ns)
    out_file = os.path.join(page_dir, page_id + '.txt')
    writefile(out_file, content, overwrite = overwrite)

    copy_attachments(pathname, attachment_dir)

    return 1
# #
# "main" starts here # "main" starts here
# #
# Parse command line options: -h/--help, -o (overwrite), -f FILE (single
# file). Positional arguments are left in ``args``.
try:
    opts, args = getopt.getopt(sys.argv[1:], 'hof:', [ "help" ])
except getopt.GetoptError as e:
    # "except ... as" and sys.stderr.write work on Python 2.6+ and 3; the
    # original "except X, e" / "print >>" forms are Python-2-only.
    sys.stderr.write('Incorrect parameters! Use --help switch to learn more.: %s\n' % e)
    sys.exit(1)

overwrite = False
inputfile = None
for o, a in opts:
    if o in ("--help", "-h"):
        # print_help() does not return; it exits with status 0.
        print_help()
    if o == "-o":
        overwrite = True
    if o == "-f":
        inputfile = a
if len(args) != 2: if len(args) != 2:
print >> sys.stderr, 'Incorrect parameters! Use --help switch to learn more.' print >> sys.stderr, 'Incorrect parameters! Use --help switch to learn more.'
@ -305,7 +305,7 @@ else:
pathnames = get_path_names(moin_pages_dir) pathnames = get_path_names(moin_pages_dir)
converted = 0 converted = 0
for pathname in pathnames: for pathname in pathnames:
res = convertfile(pathname, overwrite = overwrite) res = convertfile(pathname, overwrite = overwrite)
if res != None: if res != None:
converted += 1 converted += 1
print "Processed %d files, converted %d" % (len(pathnames), converted) print "Processed %d files, converted %d" % (len(pathnames), converted)