cryptonas-livecd/scripts/mirror_offline_doc.sh

#!/bin/bash
#
# Mirror the cryptobox.org website and a fixed set of wiki pages into the
# "_offline/doc" directory of the live-cd tree.
#
# bash is required: the script uses the "function" keyword as well as
# substring and pattern-substitution parameter expansions.

# absolute path of the project root (parent of the directory holding this script)
PROJ_DIR=$(dirname "$0")/..
# "&&" instead of ";": never fall back to the caller's working directory
PROJ_DIR=$(cd "$PROJ_DIR" && pwd) || {
	echo "Failed to determine the project directory" >&2
	exit 1
}
# target directory of the mirrored documentation
DEST_DIR="$PROJ_DIR/live-cd-tree.d/_offline/doc"
# wiki pages to be retrieved (whitespace separated, relative to the wiki's base url)
WIKI_PAGES="doc/0.3/CryptoBoxUserGettingStarted/en
	doc/0.3/CryptoBoxUserDailyUse/en
	CryptoBoxUser/en
		faq"


# Reduce a downloaded wiki page to its bare content and adjust links, anchors
# and image sources for offline use. The file is edited in place (GNU "sed -i").
# Arguments: $1 - path of the html file to be processed
#            (further arguments are accepted and ignored)
prepare_wiki_page()
{
	local page_file=$1

	# drop everything up to (and including) the line that opens the wiki content
	sed -i '1,/<div class="wikipage">/d' "$page_file"
	# remove the "comment" or "edit" stuff
	sed -i '/<h2>Comments<\/h2>/,$d' "$page_file"
	sed -i '/<div class="buttons">/,$d' "$page_file"
	# remove the last horizontal line
	sed -i '$d' "$page_file"
	# add anchor ids to every head line (the id is the text of the head line ...)
	sed -i 's#<h\([1-4]\)>\(.*\)</h#<h\1 id="\2">\2</h#g' "$page_file"
	# ... reduced to its leading letters: every pass removes one other character
	while grep -q '<h[1-4] id="[a-zA-Z]*[^a-zA-Z"]' "$page_file"
	  do	sed -i 's#<h\([1-4]\) id="\([a-zA-Z]*\)[^a-zA-Z"]#<h\1 id="\2#g' "$page_file"
	 done
	# convert wiki links: flatten the page path ("a/b/c" -> "a_b_c"); every pass
	# replaces one slash per link, so repeat until none is left
	while grep -q '="/wiki/[^"/]*/' "$page_file"
	  do	sed -i 's|="/wiki/\([^"/]*\)/|="/wiki/\1_|g' "$page_file"
	 done
	# ... and point the link to the local html file, keeping an optional anchor.
	# "|" is the delimiter here, because GNU sed treats an unescaped "#" inside
	# a bracket expression as the end of a "#"-delimited pattern.
	sed -i 's|="/wiki/\([^"#]*\)\([#"]\)|="\1.html\2|g' "$page_file"
	# remove outdated documentation: put a line break after every closed list,
	# then delete from a line mentioning "outdated" up to the end of the next list
	sed -i 's#</ol>#</ol>\n#g' "$page_file"
	sed -i '/outdated/,/<\/ol>/d' "$page_file"
	# remove "searchable" ids (blue coloring of head lines)
	sed -i 's#<div id="searchable">#<div>#g' "$page_file"
	# fix image sources: keep only the file name - the directory part and a
	# query string (e.g. "?format=raw") are dropped up to the closing quote
	sed -i 's|src="/file/[^"]*/\([^/?"]*\)[^"]*"|src="\1"|g' "$page_file"
}


# Surround the content of a page with the common header and footer.
# Globals:   page_header, page_footer (read)
# Arguments: $1 - path of the html file; it is replaced by the wrapped version
wrap_wiki_page()
{
	local content_file=$1
	local staging_file="${content_file}.new"

	# assemble the complete page in a temporary file next to the original ...
	{
		echo "$page_header"
		echo '<div class="centercontent">'
		cat "$content_file"
		echo '</div>'
		echo "$page_footer"
	} >"$staging_file"
	# ... and put it in place of the original
	mv "$staging_file" "$content_file"
}


# Tidy up the file names in the current directory:
#  - files named "<name>?format=raw" are renamed to "<name>"
#  - files whose name ends in a dot followed only by digits (e.g. "page.html.1")
#    are removed
# Globs are used instead of parsing the output of "ls", so names containing
# backslashes or leading/trailing blanks are handled correctly.
rename_files()
{
	local fname

	for fname in *'?format=raw'
	  do	# an unmatched glob is passed on literally - skip it
		test -e "$fname" || continue
		# strip everything from the first question mark on
		mv -- "$fname" "${fname%%\?*}"
	 done

	for fname in *
	  do	[[ $fname =~ \.[0-9]*$ ]] || continue
		test -e "$fname" || continue
		rm -- "$fname"
	 done
}


# Let absolute links to the wiki at devel.cryptobox.org point to the local
# copies instead: ="http://devel.cryptobox.org/wiki/a/b#x" becomes ="a_b.html#x".
# All "*.html" files of the current directory are edited in place.
redirect_homepage_links()
{
	local fname

	for fname in *.html
	  do	# an unmatched glob is passed on literally - skip it
		test -f "$fname" || continue
		# flatten the page path: every pass replaces one slash per link
		while grep -q '="http://devel\.cryptobox\.org/wiki/[^/"]*/' "$fname"
		  do	sed -i 's|\(="http://devel\.cryptobox\.org/wiki/[^/"]*\)/|\1_|g' "$fname"
		 done
		# drop the url prefix and append ".html" in front of the anchor or the
		# closing quote; the terminating character (\2) is put back as it was -
		# nothing may follow it, otherwise the attribute gets a stray quote
		sed -i 's|="http://devel\.cryptobox\.org/wiki/\([^"#]*\)\(["#]\)|="\1.html\2|g' "$fname"
	 done
}


# Delete files of the current directory that are of no use for the offline
# documentation. A missing file is not an error.
remove_useless_files()
{
	local useless_file="robots.txt"

	rm -f "$useless_file"
}


# Rename a file and update every reference to it.
# All regular files below the current directory (except for paths containing
# "/.svn") are edited in place: the old name is replaced by the new one wherever
# it directly follows a character that is neither a letter nor a digit.
# Arguments: $1 - current name of the file (relative to the current directory)
#            $2 - new name of the file
rename_one_file()
{
	local old_name=$1
	local new_name=$2
	local old_pattern new_replacement

	# The names become part of a sed command: escape the regex metacharacters
	# of the old name (otherwise e.g. "." matches any character), the special
	# characters of the replacement and the "#" delimiter.
	old_pattern=$(printf '%s\n' "$old_name" | sed 's/[][\.*^$#]/\\&/g')
	new_replacement=$(printf '%s\n' "$new_name" | sed 's/[\&#]/\\&/g')

	find . -type f ! -path '*/.svn*' \
		-exec sed -i "s#\([^a-zA-Z0-9]\)${old_pattern}#\1${new_replacement}#g" {} +
	mv -- "$old_name" "$new_name"
}


# Shorten long file names below the current directory (paths containing
# "/.svn" are skipped).
# This is necessary to avoid problems with the file name length restriction of
# iso9660 on windows systems.
# NOTE(review): the original comment cites the 31-character limit, but the
# threshold used here keeps only names of up to 12 characters - confirm which
# one is intended.
# A new name consists of the first six characters of the old path (slashes
# turned into underscores), the lowest number that makes it unique and the
# suffix (shortened to a dot plus its last four characters if it is longer).
# The renaming itself (including the update of references) is delegated to
# "rename_one_file".
rename_long_files()
{
	local file_list fname short_prefix short_suffix counter

	# Take a snapshot of the file list first: renaming files while "find" is
	# still running could make it report the freshly created names as well.
	file_list=$(find . -type f | grep -v "/\.svn")

	while IFS= read -r fname
	  do	# every path starts with "./" - thus 14 characters mean a name of 12
		test 14 -lt "${#fname}" || continue

		short_prefix=${fname:2:6}
		short_prefix=${short_prefix//\//_}

		# everything behind the last dot (there is always one: the leading "./")
		short_suffix=.${fname##*.}
		# mind the blank in ": -4" (the last four characters) - without it the
		# expansion would be "use 4 as default value" and change nothing
		test 5 -lt "${#short_suffix}" && short_suffix=.${short_suffix: -4}
		# a suffix taken from a name without extension may contain a slash
		short_suffix=${short_suffix//\//_}

		counter=0
		while test -e "$short_prefix$counter$short_suffix"
		  do	counter=$((counter + 1))
		 done
		rename_one_file "${fname:2}" "$short_prefix$counter$short_suffix"
	 done <<<"$file_list"
}


# refuse to run without the target directory: everything below writes into it
if test ! -d "$DEST_DIR"
  then	echo "Destination directory does not exist: $DEST_DIR" >&2
	exit 1
 fi

# cleanup destination directory (only if it contains a previous mirror);
# the svn metadata is kept
if test -e "$DEST_DIR/index.html"
  then	find "$DEST_DIR/" -type f ! -path '*/.svn*' -exec rm -f -- {} +
 fi

# all following steps modify the files of the current directory - never
# continue somewhere else
cd "$DEST_DIR" || exit 1

# retrieve pages from cryptobox.org
wget --mirror --page-requisites --no-directories --convert-links --quiet http://cryptobox.org
wget --output-document=header.jpg --quiet http://cryptobox.org/gfx/header.jpg

# the start page supplies the frame (header and footer) of all wiki pages
if test ! -f index.html
  then	echo "Failed to retrieve http://cryptobox.org (index.html is missing)" >&2
	exit 1
 fi
page_header=$(sed -n '1,/<!-- content starts here -->/p' index.html)
page_footer=$(sed -n '/<!-- content ends here -->/,$p' index.html)

# WIKI_PAGES is a whitespace separated list: word splitting is intended here
for page in $WIKI_PAGES
  do	# local file name: the path of the page with slashes turned into underscores
	filename=${page//\//_}.html
	wget --no-directories --page-requisites --quiet "http://devel.cryptobox.org/wiki/$page"
	# wget stores the page under the last component of its path
	downloaded=$(basename "$page")
	if test ! -f "$downloaded"
	  then	echo "Failed to retrieve wiki page: $page" >&2
		continue
	 fi
	mv "$downloaded" "$filename"
	prepare_wiki_page "$filename" "$page"
	wrap_wiki_page "$filename"
 done

rename_files
redirect_homepage_links
remove_useless_files
rename_long_files