#!/usr/bin/env python
# -*- coding: iso-8859-15 -*-
#
# Syntax: convert_html2gml [[HTML_URL] OUTPUT_FILE]
#
# reasonable defaults are used if zero or one arguments are given
#
# BEWARE: you _MUST_ adapt both the input parser ("EventParser") and the output
# formatting ("get_gml_from_data") to your specific implementation
#
# Copyright: 2010 by Lars Kruse <devel@sumpfralle.de>
# License: GNU GPL v3 or higher (http://www.gnu.org/licenses/gpl-3.0.txt)
#
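# Example invocation (illustrative; the URL and file name below are simply the
# defaults IMPORT_URL and EXPORT_FILE defined further down):
#
#   ./convert_html2gml "http://stadtgestalten.org/?q=termin-roh" events.gml
#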
"""
Popup:
 * current event:
   * event name
   * date/time
   * location ("Frieda 23")
   * "Details" -> link
 * possibly: list of further events at the same location (next few days)
   * date and event name as a combined link for "more"
"""

import sys
import HTMLParser
import urllib
import htmlentitydefs
import time
import re
import locale


IMPORT_URL = "http://stadtgestalten.org/?q=termin-roh"
EXPORT_FILE = "events.gml"
ICON_URL_PREFIX = "http://stadtgestalten.org/event_map/icons/png"
INPUT_ENCODING = 'utf-8'
DATE_ENCODING = 'iso8859-15'
# number of additional events to be displayed for the same location
MAX_EXTRA_EVENTS = 3

# as given to "locale.setlocale"
LOCALE = "de_DE"

COLUMNS = {
    'title': 0,
    'time': 1,
    'category': 2,
    'place': 3,
    'latitude': 4,
    'longitude': 5,
    'organizer': 6,
}

# mapping of categories (as defined in drupal) to filename prefixes
ICONS = {
    'Sonstiges': 'misc',
    'Party': 'party',
    'Lesung': 'lesung',
    'Demo': 'demo',
    'spontan': 'spontan',
    'Essen': 'essen',
    'Vortrag': 'vortrag',
    'Musik': 'musik',
    'Film': 'film',
    'Kunst': 'kunst',
    'Kinder': 'kinder',
    'Gruppentreffen': 'gruppentreffen',
    'default': 'misc',
}

# mapping of time "importance" (today, tomorrow, ...) to sizes
ICON_SIZES = {
    'tiny': 12,
    'small': 24,
    'medium': 32,
    'big': 40,
}

HTML_ESCAPE_TABLE = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
    ">": "&gt;",
    "<": "&lt;",
}

""" how to display different events according to their date
events before "today" and after "future" are ignored
"""
TIME_OFFSET_HOURS = {
    "today": -8,
    "soon": 1.5 * 24,
    "coming": 4 * 24,
    "future": 30 * 24,
}
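# Illustration (based on "is_event_current" and "get_icon_size" below):
# with "today" = -8 an event that started up to eight hours ago still counts
# as current, while anything more than 30 days ("future") ahead is dropped;
# "soon" and "coming" only influence the icon size of upcoming events.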


def html_escape(s):
    result = []
    for char in s:
        if char in HTML_ESCAPE_TABLE:
            result.append(HTML_ESCAPE_TABLE[char])
        elif ord(char) > 127:
            result.append("&#%d;" % ord(char))
        else:
            result.append(char)
    return u"".join(result)
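# Illustrative example (hypothetical input): html_escape(u'Kunst & K\xe4se')
# returns u'Kunst &amp; K&#228;se' - the ASCII specials become named entities,
# everything above codepoint 127 becomes a numeric character reference.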


class EventParser(HTMLParser.HTMLParser, object):
    """ This parser extracts the input from the event page generated by drupal.
    BEWARE: the drupal page MUST be configured as a table layout with the above order
    of columns (see 'COLUMNS')
    """

    def __init__(self):
        super(EventParser, self).__init__()
        self.in_data = False
        self.div_view_depth = 0
        self.in_event = False
        self.in_attribute = False
        self.current_attribute = -1
        self.events = []

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if (tag == "div"):
            if attrs_dict.has_key("class") and (attrs_dict["class"] == "view-content"):
                self.in_data = True
                self.div_view_depth = 0
            elif self.in_data:
                # count the levels of sub-divs
                self.div_view_depth += 1
        elif self.in_data and (tag == "tr"):
            # a new event item
            self.in_event = True
            self.current_attribute = -1
            self.events.append({})
        elif self.in_event and (tag == "td"):
            self.in_attribute = True
            self.current_attribute += 1
        elif self.in_attribute and (tag == "a"):
            event = self.events[-1]
            if self.current_attribute == COLUMNS['title']:
                event['event_url'] = attrs_dict["href"]
            elif self.current_attribute == COLUMNS['place']:
                event['place_url'] = attrs_dict["href"]
            elif self.current_attribute == COLUMNS['organizer']:
                event['organizer_url'] = attrs_dict["href"]

    def handle_endtag(self, tag):
        if self.in_data:
            if (tag == "div"):
                if self.div_view_depth > 0:
                    self.div_view_depth -= 1
                elif self.div_view_depth == 0:
                    self.in_event = False
                else:
                    # div_view_depth never goes below zero
                    pass
            elif (tag == "tr"):
                self.in_event = False
                # remove empty entries
                if not self.events[-1]:
                    del self.events[-1]
            elif (tag == "td"):
                self.in_attribute = False
            else:
                pass

    def handle_data(self, data):
        """ BEWARE: the "handle_data" function does not really work well
        for strings that contain an ampersand entity (e.g. "&amp;").
        In this case, it splits the string before and after the special
        character. This results (e.g.) in a split title string, and
        thus only the last part of it would be stored.
        Example: "foo &amp; bar" -> "bar"
        Solution: the strings that may contain ampersand entities/refs
        (title, place, organizer) are joined with potential special
        characters.
        See "handle_charref" and "handle_entityref" below.
        """
        if self.in_attribute:
            data = data.decode(INPUT_ENCODING)
            event = self.events[-1]
            if not data.strip():
                # ignore empty strings
                pass
            elif self.current_attribute == COLUMNS['title']:
                # maybe the title is split by an ampersand entity
                event['title'] = event.get('title', '') + html_escape(data)
            elif self.current_attribute == COLUMNS['time']:
                if event.has_key("time"):
                    # the first "time" field is the start
                    # the next ones should be ignored
                    pass
                else:
                    timestamp = re.match("[0-9]*", data).group().strip()
                    if timestamp:
                        # non-empty string
                        try:
                            event['time'] = int(timestamp)
                        except ValueError:
                            # give a warning - the event will be ignored during the check later
                            sys.stderr.write("Invalid time format: %s\n" % (timestamp, ))
                    else:
                        # empty string - this event will be ignored during the check later
                        pass
            elif self.current_attribute == COLUMNS['category']:
                event['category'] = data.strip()
            elif self.current_attribute == COLUMNS['place']:
                event['place'] = event.get('place', '') + html_escape(data)
            elif self.current_attribute == COLUMNS['latitude']:
                event['latitude'] = data.strip()
            elif self.current_attribute == COLUMNS['longitude']:
                event['longitude'] = data.strip()
            elif self.current_attribute == COLUMNS['organizer']:
                event['organizer'] = event.get('organizer', '') + html_escape(data)
            else:
                sys.stderr.write("UNKNOWN ATTRIBUTE: %d (%s)\n" % (self.current_attribute, data.encode(INPUT_ENCODING)))

    def handle_charref(self, num):
        """ add the encoded character (a numeric reference, e.g. "&#228;") to the current string """
        # decode before passing it to "handle_data"
        self.handle_data(unichr(int(num)))

    def handle_entityref(self, name):
        """ add the encoded entity (e.g. "&amp;") to the current string """
        # decode before passing it to "handle_data"
        self.handle_data(unichr(htmlentitydefs.name2codepoint[name]))
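    # Example of the split/join behaviour described in "handle_data"
    # (hypothetical cell content): for "<td>Kunst &amp; Kultur</td>" the parser
    # calls handle_data("Kunst "), handle_entityref("amp") and
    # handle_data(" Kultur"), so the title is concatenated piece by piece.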


def read_from_url(url):
    try:
        con = urllib.urlopen(url)
    except IOError, errmsg:
        sys.stderr.write("Failed to open input (%s): %s\n" % (url, errmsg))
        sys.exit(1)
    try:
        data = con.read()
    except IOError, errmsg:
        sys.stderr.write("Failed to read from input (%s): %s\n" % (url, errmsg))
        sys.exit(2)
    con.close()
    return data


def get_date_string(timestamp):
    # use locale encoding
    try:
        locale.setlocale(locale.LC_ALL, LOCALE)
    except locale.Error, err_msg:
        sys.stderr.write("Locales (%s) not found: %s\n" % (LOCALE, err_msg) \
                + " Maybe you should run 'aptitude install locales-all' on debian.\n")
    localtime = time.localtime(timestamp)
    return html_escape(time.strftime(locale.nl_langinfo(locale.D_T_FMT), localtime))
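# Note: the exact output depends on the locale's D_T_FMT; with a glibc "de_DE"
# locale it typically looks like "Sa 06 Feb 2010 20:00:00 CET" (non-ASCII month
# abbreviations such as "Mär" are additionally escaped by html_escape).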


def get_data_from_html(html):
    parser = EventParser()
    # parse the html input
    parser.feed(html)
    return filtered_events(parser.events)


def filtered_events(events):
    """ remove old or incomplete events """
    result = []
    for event in events:
        # add an empty 'category', if it is not set
        if not event.has_key('category'):
            event['category'] = None
        if not event.has_key('title'):
            sys.stderr.write("Skipping event without a title\n")
        elif not (event.has_key('longitude') and event.has_key('latitude')):
            # no error messages -> too many mails sent ...
            #sys.stderr.write(("Skipping event without long/lat: %(title)s\n" \
            #        % event).encode(INPUT_ENCODING))
            pass
        elif not event.has_key('time'):
            sys.stderr.write(("Skipping event without date/time: %(title)s\n" \
                    % event).encode(INPUT_ENCODING))
        elif not is_event_current(event["time"]):
            # skip events that are too old or too far in the future
            pass
        else:
            # the event is valid
            result.append(event)
    return result


# icon sizes depend on the current date and the date of the event
# use the timestamp "None" for the smallest icon
def get_icon_size(timestamp):
    now = time.time()
    if timestamp is None:
        return ICON_SIZES["tiny"]
    if timestamp >= now + 3600 * TIME_OFFSET_HOURS["coming"]:
        return ICON_SIZES["small"]
    elif timestamp >= now + 3600 * TIME_OFFSET_HOURS["soon"]:
        return ICON_SIZES["medium"]
    else:
        return ICON_SIZES["big"]


def is_event_current(timestamp):
    now = time.time()
    if timestamp < now + 3600 * TIME_OFFSET_HOURS["today"]:
        return False
    elif timestamp >= now + 3600 * TIME_OFFSET_HOURS["future"]:
        return False
    else:
        return True


def get_icon_url(category, timestamp):
    if category in ICONS.keys():
        filename = ICONS[category]
    else:
        filename = ICONS["default"]
    size = get_icon_size(timestamp)
    return "%s/%s_%d.png" % (ICON_URL_PREFIX, filename, size)


def get_gml_from_data(data):
    result = []
    # first line: the header for gml
    result.append(['point', 'title', 'description', 'icon', 'iconSize', 'iconOffset'])
    # add one line for each event
    for place in group_sorted_events_by_location(data):
        result.append([])
        event = place[0]
        items = result[-1]
        # the 'point'
        items.append('%(latitude)s,%(longitude)s' % event)
        # the 'title'
        items.append('<a href="%(event_url)s" title="Details" target="_blank">%(title)s</a>' % event)
        # the 'description'
        description = '<ul>'
        # maybe only a date without time is given
        description += "<li>Termin: %s</li>" % get_date_string(event["time"])
        description += '<li>Ort: <a href="%(place_url)s" title="Ortsbeschreibung" target="_blank">%(place)s</a></li>' % event
        if event.has_key('organizer'):
            description += '<li>Veranstalter: <a href="%(organizer_url)s" title="Veranstalterinfos" target="_blank">%(organizer)s</a></li>' % event
        description += '<li><a href="%(event_url)s" title="%(title)s" target="_blank">... mehr Infos</a></li>' % event
        if len(place) > 1:
            # other events will take place here ...
            description += '<li>Weitere Veranstaltungen:<ul>'
            # add not more than MAX_EXTRA_EVENTS additional events
            num_events = min(MAX_EXTRA_EVENTS, len(place)-1)
            for index in range(1, num_events+1):
                other_event = place[index]
                # show a small icon for each other event
                description += '<li style="list-style-image:url(%s)">' \
                        % get_icon_url(other_event["category"], None)
                description += '%s: <a href="%s" title="Details" target="_blank">%s</a></li>' \
                        % (get_date_string(other_event["time"]).decode(DATE_ENCODING),
                        other_event["event_url"], other_event["title"])
            description += '</ul></li>'
        description += '</ul>'
        items.append(description)
        # the 'icon'
        items.append(get_icon_url(event['category'], event['time']))
        # 'iconSize' and 'iconOffset'
        size = get_icon_size(event['time'])
        # size of the icon
        items.append('%d,%d' % (size, size))
        # offset of the middle of the icon
        items.append('%d,%d' % (0, -size))
    # turn the array into a string
    return '\n'.join(['\t'.join(event) for event in result])
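# Despite the ".gml" default file name, the generated output is a plain
# tab-separated text table: one header line plus one line per location, e.g.
# (values illustrative, <TAB> stands for a literal tab character):
#   point<TAB>title<TAB>description<TAB>icon<TAB>iconSize<TAB>iconOffset
#   54.09,12.13<TAB><a href="...">...</a><TAB><ul>...</ul><TAB>.../film_40.png<TAB>40,40<TAB>0,-40
# which matches the tab-separated format read by the OpenLayers text layer.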


def unicode2htmlentities(text):
    result = ""
    for c in text:
        if ord(c) < 128:
            result += c
        else:
            result += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
    return result
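# Example: unicode2htmlentities(u'Caf\xe9') -> 'Caf&eacute;'
# (note: codepoints without a named entity would raise a KeyError here)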


def write_to_file(output_file, data):
    try:
        con = open(output_file, 'w')
    except IOError, errmsg:
        sys.stderr.write("Failed to open output file for writing (%s): %s\n" % (output_file, errmsg))
        sys.exit(3)
    try:
        con.write(unicode2htmlentities(data))
        # a final linebreak is necessary - otherwise openlayers ignores the last line
        con.write("\n")
    except IOError, errmsg:
        sys.stderr.write("Failed to write to output file (%s): %s\n" % (output_file, errmsg))
        sys.exit(4)
    con.close()


def is_same_location(event_a, event_b):
    """ check if two events are located at the same place
    beware: for now it does not take the name of the place into account.
    It only compares latitude and longitude.
    """
    if (event_a["longitude"] == event_b["longitude"]) \
            and (event_a["latitude"] == event_b["latitude"]):
        return True
    else:
        return False


def group_sorted_events_by_location(events):
    """ convert a list of events into a list of places - each being a list
    of events happening at this location.
    """
    places = []
    for event in events:
        # we use this variable to store a (possibly) matching place
        already_defined_place = None
        for place in places:
            if is_same_location(event, place[0]):
                already_defined_place = place
        if already_defined_place is None:
            # create a new place (starting with the current event)
            places.append([event])
        else:
            # add it to the existing place
            already_defined_place.append(event)
    # sort the events of each place by time
    def cmp_event(x, y):
        if x["time"] < y["time"]:
            return -1
        elif x["time"] == y["time"]:
            return 0
        else:
            return 1
    for place in places:
        place.sort(cmp=cmp_event)
    return places
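# Resulting structure (illustrative): [[event_a1, event_a2], [event_b1]] -
# one inner list per distinct latitude/longitude pair, each sorted by "time".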


if __name__ == '__main__':
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    else:
        input_file = IMPORT_URL
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    else:
        output_file = EXPORT_FILE
    html_data = read_from_url(input_file)
    data = get_data_from_html(html_data)
    gml_data = get_gml_from_data(data)
    write_to_file(output_file, gml_data)