#!/bin/sh
#
# Reads the "unmapped places" quality report from QUALITY_URL, keeps the
# part of it that get_place_data selects, and passes it through
# get_place_inhabitants, which looks up a size figure for each place on
# Wikipedia and inserts that figure plus a "size / nodes" ratio into the
# stream.
#
# Requires: wget, grep (with -A/-B context support), sed, cut, tail, awk, bc.
# Uses `local`, which is not strictly POSIX but is what the script has
# always relied on (dash, busybox ash and bash all provide it).
#
# NOTE(review): several string literals below ('' in the first echo, the
# "^$" grep patterns, the bare $PLACE_TYPE echo) look as if markup was
# stripped from them at some point. They are kept exactly as found -
# confirm against the upstream version before relying on the output format.

# original data
QUALITY_URL="http://www.gary68.de/osm/qa/unmapped/mecklenburg-vorpommern.htm"

# where to get the number of inhabitants
WIKIPEDIA_URL="http://de.wikipedia.org/wiki"

# extended regular expression selecting the lines that describe a place
PLACE_TYPE="(town|city)"

# Some wikipedia pages have different names.
# Format: one mapping per line, "<key> <wikipedia page name>", separated by
# whitespace. The key (first column) is the place name with ALL non-letter
# (7-bit) characters removed, e.g. "Röbel/Müritz" becomes "RbelMritz".
PLACE_NAME_MAPPING="Feldberg Feldberger_Seenlandschaft
BoizenburgElbe Boizenburg
FrstenbergHavel F%C3%BCrstenberg/Havel
Goldberg Goldberg_(Mecklenburg)
Malchow Malchow_(Mecklenburg)
OstseebadRerik Rerik
RbelMritz Röbel
Strasburg Strasburg_(Uckermark)
Tessin Tessin_(bei_Rostock)
Wesenberg Wesenberg_(Mecklenburg)
Zarrentin Amt_Zarrentin"

# We need the "C" locale - otherwise 'sed' will not filter umlauts.
# LC_ALL is used because merely clearing LANG does not override an
# inherited LC_ALL / LC_CTYPE / LC_COLLATE.
LC_ALL=C
export LC_ALL

#######################################
# Print the text between the first ">" and the following "<" of each
# input line (i.e. the second ">"-separated field, cut at the next "<").
# Inputs:  lines on stdin
# Outputs: extracted text on stdout
#######################################
extract_cell_text() {
  cut -d ">" -f 2 | cut -d "<" -f 1
}

#######################################
# Succeed if $1 consists only of decimal digits (at least one).
#######################################
is_unsigned_integer() {
  case "$1" in
    '' | *[!0-9]*) return 1 ;;
  esac
  return 0
}

#######################################
# Download the quality report and print the selected excerpt, framed by
# the two literal echo lines.
# Globals: QUALITY_URL (read)
# Outputs: excerpt on stdout
#######################################
get_place_data() {
  # NOTE(review): empty string literal kept as found - possibly a stripped tag.
  echo ''
  # Keep everything up to the "Details all information" marker, then each
  # line matching the pattern together with 2 lines before and 4 after;
  # finally drop the "--" group separators grep inserts between groups.
  # NOTE(review): pattern "^$" kept as found - presumably it once referenced
  # $PLACE_TYPE inside markup; verify.
  wget --quiet --output-document - "$QUALITY_URL" \
    | sed -n "1,/Details all information/p" \
    | grep -E -A 4 -B 2 "^$" \
    | grep -v "^--$"
  # NOTE(review): single-quoted, so $PLACE_TYPE is printed literally; kept
  # byte-for-byte as found.
  echo '
$PLACE_TYPE
'
}

#######################################
# Fetch the Wikipedia page for a place and print the digits found on the
# line following the "Einwohner:" line.
# Globals:   WIKIPEDIA_URL (read)
# Arguments: $1 - wikipedia page name
# Outputs:   digits only (thousands separators etc. removed); empty if the
#            page could not be fetched or the marker line is missing
#######################################
get_place_size() {
  # NOTE(review): pattern "^Einwohner:$" kept as found - it may have lost
  # surrounding markup; verify against the real page source.
  wget --quiet --output-document - "$WIKIPEDIA_URL/$1" \
    | grep -A 1 "^Einwohner:$" \
    | tail -n 1 \
    | extract_cell_text \
    | sed 's/[^0-9]//g'
}

#######################################
# Translate a place name into its wikipedia page name.
# Globals:   PLACE_NAME_MAPPING (read)
# Arguments: $1 - place name as found in the quality report
# Outputs:   mapped page name if the letters-only form of $1 equals a key
#            in PLACE_NAME_MAPPING, otherwise $1 unchanged
#######################################
normalize_place_name() {
  local simple_name
  local new_name

  # Reduce the name to 7-bit letters; LC_ALL=C makes every non-ASCII byte
  # count as a non-letter regardless of the caller's locale.
  simple_name="$(printf '%s\n' "$1" | LC_ALL=C sed 's/[^a-zA-Z]//g')"

  new_name=""
  if test -n "$simple_name"; then
    # Exact match on the first column (a prefix match would confuse names
    # that merely start alike, and an empty key would match every row).
    new_name="$(printf '%s\n' "$PLACE_NAME_MAPPING" \
      | awk -v key="$simple_name" 'NF >= 2 && $1 == key { print $2; exit }')"
  fi

  if test -z "$new_name"; then
    printf '%s\n' "$1"
  else
    printf '%s\n' "$new_name"
  fi
}

#######################################
# Filter: copy stdin to stdout and insert size information.
#  - On a line matching PLACE_TYPE, the place name is taken from the
#    previous line and its size is looked up via get_place_size.
#  - On an empty line, the last looked-up size and the ratio
#    "size / nodes" (two decimals, nodes taken from the previous line) are
#    printed before the line itself. If either operand is not a usable
#    number, an empty line is printed instead of the ratio.
# Globals: PLACE_TYPE (read)
#######################################
get_place_inhabitants() {
  local line
  local previous_line
  local place_name
  local place_size
  local place_nodes

  previous_line=""
  place_size=""

  # -r keeps backslashes intact; IFS is left alone on purpose so leading
  # and trailing whitespace is still stripped as before.
  while read -r line; do
    if printf '%s\n' "$line" | grep -q -E -- "$PLACE_TYPE"; then
      place_name="$(printf '%s\n' "$previous_line" | extract_cell_text)"
      place_name="$(normalize_place_name "$place_name")"
      place_size="$(get_place_size "$place_name")"
    # NOTE(review): pattern "^$" kept as found - possibly stripped markup.
    elif printf '%s\n' "$line" | grep -q "^$"; then
      printf '%s\n' "$place_size"
      place_nodes="$(printf '%s\n' "$previous_line" | extract_cell_text)"
      # Only call bc with sane operands: it would otherwise fail with a
      # syntax error (empty operand) or a division by zero.
      if is_unsigned_integer "$place_size" \
        && is_unsigned_integer "$place_nodes" \
        && printf '%s\n' "$place_nodes" | grep -q '[1-9]'; then
        printf 'scale=2;%s/%s\n' "$place_size" "$place_nodes" | bc
      else
        printf '\n'
      fi
    fi
    printf '%s\n' "$line"
    previous_line="$line"
  done
}

get_place_data | get_place_inhabitants