Elan Ruusamäe 2011-02-06 18:46:19 +02:00
parent 5bcbfc9a30
commit 5f64d725ee

View file

@ -1,3 +1,4 @@
#!/usr/bin/python
import sys, os, os.path import sys, os, os.path
import re import re
from os import listdir from os import listdir
@ -7,11 +8,11 @@ def check_dirs(moin_pages_dir, output_dir):
if not isdir(moin_pages_dir): if not isdir(moin_pages_dir):
print >> sys.stderr, "MoinMoin pages directory doesn't exist!" print >> sys.stderr, "MoinMoin pages directory doesn't exist!"
sys.exit(1) sys.exit(1)
if not isdir(output_dir): if not isdir(output_dir):
print >> sys.stderr, "Output directory doesn't exist!" print >> sys.stderr, "Output directory doesn't exist!"
sys.exit(1) sys.exit(1)
def get_page_names(moin_pages_dir): def get_page_names(moin_pages_dir):
items = listdir(moin_pages_dir) items = listdir(moin_pages_dir)
pages = [] pages = []
@ -21,38 +22,61 @@ def get_page_names(moin_pages_dir):
pages.append(item) pages.append(item)
return pages return pages
def get_current_revision(page_dir):
    """Return the path of the newest revision file of a page directory.

    The entries of ``<page_dir>/revisions`` are compared as strings and the
    greatest name is taken as the current revision.

    page_dir -- path of a single page directory

    Returns the joined path of the chosen revision file, or '' when the
    ``revisions`` directory is missing or contains no entries.
    """
    rev_dir = os.path.join(page_dir, 'revisions')
    if not os.path.isdir(rev_dir):
        return ''
    revisions = os.listdir(rev_dir)
    if not revisions:
        # An empty directory used to raise IndexError on revisions[-1];
        # answer the same way as for a missing directory instead.
        return ''
    # max() picks the same entry that sort() followed by [-1] would.
    return os.path.join(rev_dir, max(revisions))
def copy_attachments(page_dir, attachment_dir):
    """Copy the attachment files of a page into ``attachment_dir``.

    Looks for an ``attachments`` sub-directory inside ``page_dir``; when it
    does not exist nothing is done.

    page_dir       -- path of a single page directory
    attachment_dir -- destination handed to ``shutil.copy`` for each file

    A file that cannot be copied is reported on stderr and the remaining
    attachments are still processed.
    """
    # Function-scope import: the block needs shutil, and this keeps the
    # change independent of the file-level import lines.
    import shutil

    source_dir = os.path.join(page_dir, 'attachments')
    if not os.path.isdir(source_dir):
        return
    for attachment in os.listdir(source_dir):
        source = os.path.join(source_dir, attachment)
        # The former plain ``cp`` (no -r) never copied directories either.
        if not os.path.isfile(source):
            continue
        try:
            # No shell is involved, so names containing quotes, ``$`` or
            # backticks are copied verbatim instead of being interpreted.
            shutil.copy(source, attachment_dir)
        except EnvironmentError as err:
            sys.stderr.write('Could not copy attachment %s: %s\n' % (source, err))
def convert_page(page, file):
namespace = ':'
for i in range(0, len(file) - 1):
namespace += file[i] + ':'
def convert_page(page):
regexp = ( regexp = (
('\[\[TableOfContents\]\]', ''), # remove ('\[\[TableOfContents.*\]\]', ''), # remove
('\[\[BR\]\]$', ''), # newline at end of line - remove ('\[\[BR\]\]$', ''), # newline at end of line - remove
('\[\[BR\]\]', '\n'), # newline ('\[\[BR\]\]', '\n'), # newline
('#pragma section-numbers off', ''), # remove ('#pragma section-numbers off', ''), # remove
('^##.*?\\n', ''), # remove ('^##.*?\\n', ''), # remove
('\["(.*)"\]', '[[\\1]]'), # internal link ('\[:(.*):', '[[\\1]] '), # internal link
('(\[http.*\])', '[\\1]'), # web link ('\[\[(.*)/(.*)\]\]', '[[\\1:\\2]]'),
('(\[\[.*\]\]).*\]', '\\1'),
('\[(http.*) .*\]', '[[\\1]]'), # web link
('\["/(.*)"\]', '[['+file[-1]+':\\1]]'),
('\{{3}', '<>code>'), # code open ('\{{3}', '<>code>'), # code open
('\}{3}', '<>/code>'), # code close ('\}{3}', '<>/code>'), # code close
('^\s\s\s\s\*', ' *'),
('^\s\s\s\*', ' *'),
('^\s\s\*', ' *'),
('^\s\*', ' *'), # lists must have not only but 2 whitespaces before * ('^\s\*', ' *'), # lists must have not only but 2 whitespaces before *
('={5}(\s.*\s)={5}$', '==\\1=='), # heading 5 ('^\s\s\s\s1\.', ' -'),
('={4}(\s.*\s)={4}$', '===\\1}==='), # heading 4 ('^\s\s1\.', ' -'),
('={3}(\s.*\s)={3}$', '====\\1===='), # heading 3 ('^\s1\.', ' -'),
('={2}(\s.*\s)={2}$', '=====\\1====='), # heading 2 ('^\s*=====\s*(.*)\s*=====\s*$', '=-=- \\1 =-=-'), # heading 5
('={1}(\s.*\s)={1}$', '======\\1======'), # heading 1 ('^\s*====\s*(.*)\s*====\s*$', '=-=-=- \\1 =-=-=-'), # heading 4
('^\s*===\s*(.*)\s*===\s*$', '=-=-=-=- \\1 =-=-=-=-'), # heading 3
('^\s*==\s*(.*)\s*==\s*$', '=-=-=-=-=- \\1 =-=-=-=-=-'), # heading 2
('^\s*=\s*(.*)\s=\s*$', '=-=-=-=-=-=- \\1 =-=-=-=-=-=-'), # heading 1
('=-', '='),
('\|{2}', '|'), # table separator ('\|{2}', '|'), # table separator
('\'{5}(.*)\'{5}', '**//\\1//**'), # bold and italic ('\'{5}(.*)\'{5}', '**//\\1//**'), # bold and italic
('\'{3}(.*)\'{3}', '**\\1**'), # bold ('\'{3}(.*)\'{3}', '**\\1**'), # bold
('\'{2}(.*)\'{2}', '//\\1//'), # italic ('\'{2}(.*)\'{2}', '//\\1//'), # italic
('(?<!\[)(\b[A-Z]+[a-z]+[A-Z][A-Za-z]*\b)','[[\\1]]'), # CamelCase, dont change if CamelCase is in InternalLink ('(?<!\[)(\b[A-Z]+[a-z]+[A-Z][A-Za-z]*\b)','[[\\1]]'), # CamelCase, dont change if CamelCase is in InternalLink
('\[\[Date\(([\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}Z)\)\]\]', '\\1') # Date value ('\[\[Date\(([\d]{4}-[\d]{2}-[\d]{2}T[\d]{2}:[\d]{2}:[\d]{2}Z)\)\]\]', '\\1'), # Date value
('attachment:(.*)','{{'+namespace+'\\1|}}')
) )
for i in range(len(page)): for i in range(len(page)):
line = page[i] line = page[i]
for item in regexp: for item in regexp:
@ -60,18 +84,15 @@ def convert_page(page):
page[i] = line page[i] = line
return page return page
def print_help():
    """Write the usage text to standard output and exit with status 0."""
    usage_lines = (
        "Usage: moinconv.py <moinmoin pages directory> <output directory>",
        "Convert MoinMoin pages to DokuWiki.",
    )
    for text in usage_lines:
        sys.stdout.write(text + '\n')
    sys.exit(0)
def print_parameter_error():
    """Report bad command-line parameters on stderr and exit with status 1."""
    message = 'Incorrect parameters! Use --help switch to learn more.'
    sys.stderr.write(message + '\n')
    sys.exit(1)
if __name__ == '__main__': if __name__ == '__main__':
if len(sys.argv) > 1: if len(sys.argv) > 1:
if sys.argv[1] in ('-h', '--help'): if sys.argv[1] in ('-h', '--help'):
@ -85,6 +106,7 @@ if __name__ == '__main__':
print_parameter_error() print_parameter_error()
check_dirs(moin_pages_dir, output_dir) check_dirs(moin_pages_dir, output_dir)
print 'Input dir is: %s.' % moin_pages_dir print 'Input dir is: %s.' % moin_pages_dir
print 'Output dir is: %s.' % output_dir print 'Output dir is: %s.' % output_dir
print print
@ -92,16 +114,49 @@ if __name__ == '__main__':
pages = get_page_names(moin_pages_dir) pages = get_page_names(moin_pages_dir)
for page in pages: for page in pages:
curr_rev = get_current_revision(page) curr_rev = get_current_revision(page)
curr_rev_desc = file(curr_rev, 'r') if os.path.exists(curr_rev):
curr_rev_content = curr_rev_desc.readlines() page_name = basename(page).lower()
curr_rev_desc.close() curr_rev_desc = file(curr_rev, 'r')
curr_rev_content = curr_rev_desc.readlines()
curr_rev_desc.close()
curr_rev_content = convert_page(curr_rev_content) if 'moineditorbackup' not in page_name: #dont convert backups
page_name = page_name.replace('(2d)', '-')
page_name = page_name.replace('(c3bc)', 'ue')
page_name = page_name.replace('(c384)', 'Ae')
page_name = page_name.replace('(c3a4)', 'ae')
page_name = page_name.replace('(c3b6)', 'oe')
page_name = basename(page).lower() split = page_name.split('(2f)') # namespaces
out_file = os.path.join(output_dir, page_name + '.txt') count = len(split)
out_desc = file(out_file, 'w') dateiname = split[-1]
out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it])
out_desc.close()
print 'Migrated %s to %s.' % (basename(page), basename(out_file)) dir = output_dir
attachment_dir = output_dir + '../media/'
if count == 1:
dir += 'unsorted'
if not isdir (dir):
os.mkdir(dir)
attachment_dir += 'unsorted/'
if not isdir (attachment_dir):
os.mkdir(attachment_dir)
for i in range(0, count - 1):
dir += split[i] + '/'
if not isdir (dir):
os.mkdir(dir)
attachment_dir += split[i] + '/'
if not isdir (attachment_dir):
os.mkdir(attachment_dir)
if count == 1:
str = 'unsorted/' + page_name
split = str.split('/')
curr_rev_content = convert_page(curr_rev_content, split)
else:
curr_rev_content = convert_page(curr_rev_content, split)
out_file = os.path.join(dir, dateiname + '.txt')
out_desc = file(out_file, 'w')
out_desc.writelines([it.rstrip() + '\n' for it in curr_rev_content if it])
out_desc.close()
copy_attachments(page, attachment_dir)