codekasten/osm/dynamic_markers/scripts/convert_raw2gml.py

466 lines
14 KiB
Python
Raw Permalink Normal View History

#!/usr/bin/env python
# -*- coding: iso-8859-15 -*-
#
# Syntax: convert_html2gml [[HTML_URL] OUTPUT_FILE]
#
# reasonable defaults are used, if zero or one arguments are given
#
# BEWARE: you _MUST_ adapt both the input parser ("EventParser") and the output
# formatting ("get_gml_from_data") to your specific implementation
#
# Copyright: 2010 by Lars Kruse <devel@sumpfralle.de>
# License: GNU GPL v3 or higher (http://www.gnu.org/licenses/gpl-3.0.txt)
#
"""
Popup:
* aktuelle Veranstaltung:
* Veranstaltungsname
* Termin
* Ort ("Frieda 23")
* "Details" -> Link
* evt.: Liste weiterer Veranstaltungen am selben Ort (naechste Tage)
* Datum und Veranstaltungsname als gemeinsamer Link fuer "mehr"
"""
import sys
import HTMLParser
import urllib
import htmlentitydefs
import time
import re
import locale
# default source (used if no URL is given as the first argument)
IMPORT_URL = "http://stadtgestalten.org/?q=termin-roh"
# default target (used if no filename is given as the second argument)
EXPORT_FILE = "events.gml"
# the icon URLs are built as: PREFIX/<category filename>_<size>.png
ICON_URL_PREFIX = "http://stadtgestalten.org/event_map/icons/png"
# used for encoding event data within messages written to stderr
INPUT_ENCODING = 'utf-8'
# used for decoding the formatted date strings of additional events
DATE_ENCODING = 'iso8859-15'
# number of additional events to be displayed for the same location
MAX_EXTRA_EVENTS = 3
# as given to "locale.setlocale"
LOCALE = "de_DE"
# position of each attribute within a table row of the input page
COLUMNS = {
    'title': 0,
    'time': 1,
    'category': 2,
    'place': 3,
    'latitude': 4,
    'longitude': 5,
    'organizer': 6,
}
# mapping of categories (as defined in drupal) to filename prefixes
ICONS = {
    'Sonstiges': 'misc',
    'Party': 'party',
    'Lesung': 'lesung',
    'Demo': 'demo',
    'spontan': 'spontan',
    'Essen': 'essen',
    'Vortrag': 'vortrag',
    'Musik': 'musik',
    'Film': 'film',
    'Kunst': 'kunst',
    'Kinder': 'kinder',
    'Gruppentreffen': 'gruppentreffen',
    'default': 'misc',
}
# mapping of time "importance" (today, tomorrow, ...) to sizes
ICON_SIZES = {
    'tiny': 12,
    'small': 24,
    'medium': 32,
    'big': 40,
}
# characters that are replaced by named entities in "html_escape"
HTML_ESCAPE_TABLE = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
    ">": "&gt;",
    "<": "&lt;",
}
# how to display different events according to their date (offsets in hours
# relative to the current time)
# events before "today" and after "future" are ignored
TIME_OFFSET_HOURS = {
    "today": -8,
    "soon": 1.5 * 24,
    "coming": 4 * 24,
    "future": 30 * 24,
}
def html_escape(s):
    """ replace special HTML characters and non-ASCII characters with entities

    Characters listed in HTML_ESCAPE_TABLE are replaced by their named entity.
    Every other character with an ordinal above 127 is replaced by a numeric
    character reference (e.g. "&#228;"). The result only consists of ASCII
    characters.

    @param s: the string to be escaped
    @return: the escaped (unicode) string
    """
    result = []
    for char in s:
        if char in HTML_ESCAPE_TABLE:
            result.append(HTML_ESCAPE_TABLE[char])
        elif ord(char) > 127:
            result.append("&#%d;" % ord(char))
        else:
            result.append(char)
    return u"".join(result)
class EventParser(HTMLParser.HTMLParser, object):
    """ This parser extracts the input from the event page generated by drupal.

    BEWARE: the drupal page MUST be configured as a table layout with the
    above order of columns (see 'COLUMNS')

    The parser only looks at table rows inside of the div with the class
    "view-content". Every row ("tr") turns into one dict within "self.events".
    The cells ("td") of a row are mapped to attributes by their position.
    """

    def __init__(self):
        super(EventParser, self).__init__()
        # are we inside of the "view-content" div?
        self.in_data = False
        # nesting level of divs below the "view-content" div
        self.div_view_depth = 0
        # are we inside of a table row?
        self.in_event = False
        # are we inside of a table cell?
        self.in_attribute = False
        # index of the current table cell within its row (see 'COLUMNS')
        self.current_attribute = -1
        self.events = []

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if tag == "div":
            if attrs_dict.get("class") == "view-content":
                self.in_data = True
                self.div_view_depth = 0
            elif self.in_data:
                # count the levels of sub-divs
                self.div_view_depth += 1
        elif self.in_data and (tag == "tr"):
            # a new event item
            self.in_event = True
            self.current_attribute = -1
            self.events.append({})
        elif self.in_event and (tag == "td"):
            self.in_attribute = True
            self.current_attribute += 1
        elif self.in_attribute and (tag == "a") and self.events:
            href = attrs_dict.get("href")
            if href is None:
                # an anchor without a target (e.g. <a name="...">)
                return
            event = self.events[-1]
            if self.current_attribute == COLUMNS['title']:
                event['event_url'] = href
            elif self.current_attribute == COLUMNS['place']:
                event['place_url'] = href
            elif self.current_attribute == COLUMNS['organizer']:
                event['organizer_url'] = href

    def handle_endtag(self, tag):
        if not self.in_data:
            return
        if tag == "div":
            if self.div_view_depth > 0:
                self.div_view_depth -= 1
            else:
                # The "view-content" div itself was closed: everything
                # following it does not belong to the list of events.
                self.in_data = False
                self.in_event = False
                self.in_attribute = False
        elif tag == "tr":
            self.in_event = False
            # a cell without a closing tag must not leak into the next row
            self.in_attribute = False
            # remove empty entries
            if self.events and not self.events[-1]:
                del self.events[-1]
        elif tag == "td":
            self.in_attribute = False

    def handle_data(self, data):
        """ BEWARE: the "handle_data" function does not really work well
        for strings, that contain an ampersand entity (e.g. &amp;).
        In this case, it splits the string before and after the special
        character. This results (e.g.) in a split title string and
        thus only the last part of it is correctly stored.
        Example: "foo &amp; bar" -> "bar"
        Solution: the strings, that may contain ampersand entities/refs
        (title, place, organizer) are joined with potential special
        characters.
        See "handle_charref" and "handle_entityref" below
        """
        if not self.in_attribute or not self.events:
            return
        if not data.strip():
            # ignore empty strings
            return
        event = self.events[-1]
        if self.current_attribute == COLUMNS['title']:
            # maybe the title is split by an ampersand entity
            event['title'] = event.get('title', '') + html_escape(data)
        elif self.current_attribute == COLUMNS['time']:
            # the first "time" field is the start
            # the next ones should be ignored
            if 'time' not in event:
                timestamp = re.match("[0-9]*", data).group().strip()
                # an empty string is skipped - the event will be ignored
                # during the check later
                if timestamp:
                    try:
                        event['time'] = int(timestamp)
                    except ValueError:
                        # give a warning - the event will be ignored during
                        # the check later
                        sys.stderr.write("Invalid time format: %s\n" % (timestamp, ))
        elif self.current_attribute == COLUMNS['category']:
            event['category'] = data.strip()
        elif self.current_attribute == COLUMNS['place']:
            event['place'] = event.get('place', '') + html_escape(data)
        elif self.current_attribute == COLUMNS['latitude']:
            event['latitude'] = data.strip()
        elif self.current_attribute == COLUMNS['longitude']:
            event['longitude'] = data.strip()
        elif self.current_attribute == COLUMNS['organizer']:
            event['organizer'] = event.get('organizer', '') + html_escape(data)
        else:
            sys.stderr.write("UNKNOWN ATTRIBUTE: %d (%s)\n" % (self.current_attribute, data.encode(INPUT_ENCODING)))

    def handle_charref(self, num):
        """ add the encoded character (e.g. &#20; or &#x14;) to the current string """
        # decode before passing it to "handle_data"
        try:
            if num[:1] in ("x", "X"):
                # hexadecimal notation
                codepoint = int(num[1:], 16)
            else:
                codepoint = int(num)
            char = unichr(codepoint)
        except (ValueError, OverflowError):
            sys.stderr.write("Ignoring invalid character reference: &#%s;\n" % (num, ))
            return
        self.handle_data(char)

    def handle_entityref(self, name):
        """ add the encoded entity (e.g. &amp;) to the current string """
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            # unknown entity: keep its literal text instead of failing
            self.handle_data(u"&%s;" % name)
        else:
            # decode before passing it to "handle_data"
            self.handle_data(unichr(codepoint))
def read_from_url(url):
    """ download the content of the given URL and return it as a decoded string

    The charset announced in the response headers is used for decoding.
    INPUT_ENCODING is used as a fallback, if the server did not specify one.
    The program exits with status 1 (open failed) or 2 (read failed) in case
    of errors.

    @param url: the location of the input page
    @return: the decoded content of the page
    """
    try:
        con = urllib.urlopen(url)
    except IOError as errmsg:
        sys.stderr.write("Failed to open input (%s): %s\n" % (url, errmsg))
        sys.exit(1)
    try:
        data = con.read()
        # "getparam" returns None if no charset was given
        encoding = con.headers.getparam('charset') or INPUT_ENCODING
    except IOError as errmsg:
        sys.stderr.write("Failed to read from input (%s): %s\n" % (url, errmsg))
        sys.exit(2)
    finally:
        # release the connection even if reading failed
        con.close()
    return data.decode(encoding)
def get_date_string(timestamp):
    """ format a timestamp (seconds since the epoch) as a local date and time

    The date/time format of the configured LOCALE is used. A warning is
    written to stderr if this locale is not available. The result is passed
    through "html_escape".
    """
    # use locale encoding
    try:
        locale.setlocale(locale.LC_ALL, LOCALE)
    except locale.Error as err_msg:
        warning = "Locales (%s) not found: %s\n" % (LOCALE, err_msg)
        hint = " Maybe you should run 'aptitude install locales-all' on debian.\n"
        sys.stderr.write(warning + hint)
    localtime = time.localtime(timestamp)
    date_format = locale.nl_langinfo(locale.D_T_FMT)
    formatted = time.strftime(date_format, localtime)
    return html_escape(formatted)
def get_data_from_html(html):
    """ extract the events from the given HTML text

    @return: the list of parsed events, that passed "filtered_events"
    """
    event_parser = EventParser()
    # parse the html input
    event_parser.feed(html)
    parsed_events = event_parser.events
    return filtered_events(parsed_events)
def filtered_events(events):
    """ remove old or incomplete events

    An event is kept, if it contains a title, a position (longitude and
    latitude) and a time accepted by "is_event_current".
    Side effect: the 'category' of each given event is set to None, if it
    is missing.

    @param events: list of event dicts
    @return: list of the valid events (in their original order)
    """
    result = []
    for event in events:
        # add an empty 'category', if it is not set
        event.setdefault('category', None)
        if 'title' not in event:
            sys.stderr.write("Skipping event without a title\n")
        elif ('longitude' not in event) or ('latitude' not in event):
            # Events without a position are skipped silently: a message for
            # each of them caused too many mails being sent.
            pass
        elif 'time' not in event:
            sys.stderr.write(("Skipping event without date/time: %(title)s\n" \
                    % event).encode(INPUT_ENCODING))
        elif is_event_current(event["time"]):
            # the event is valid
            result.append(event)
        # else: the event is too old or too far in the future
    return result
# icon sizes depend on the current date and the date of the event
# use the timestamp "None" for the smallest icon
def get_icon_size(timestamp):
    """ return the icon size (see ICON_SIZES) suitable for the given timestamp

    The timestamp None results in the "tiny" size. Otherwise the size
    depends on the distance between now and the timestamp:
    at least "coming" hours ahead -> "small", at least "soon" hours
    ahead -> "medium", everything else -> "big".
    """
    now = time.time()
    if timestamp is None:
        return ICON_SIZES["tiny"]
    # ordered from the most distant threshold to the nearest one
    thresholds = (("coming", "small"), ("soon", "medium"))
    for offset_name, size_name in thresholds:
        if timestamp >= now + 3600 * TIME_OFFSET_HOURS[offset_name]:
            return ICON_SIZES[size_name]
    return ICON_SIZES["big"]
def is_event_current(timestamp):
    """ check if the timestamp is within the range of displayed events

    The range starts at the "today" offset (inclusive) and ends at the
    "future" offset (exclusive) - both relative to the current time.
    """
    now = time.time()
    earliest = now + 3600 * TIME_OFFSET_HOURS["today"]
    latest = now + 3600 * TIME_OFFSET_HOURS["future"]
    return not ((timestamp < earliest) or (timestamp >= latest))
def get_icon_url(category, timestamp):
    """ return the URL of the icon for the given category and timestamp

    Unknown categories use the "default" icon. The timestamp determines the
    size of the icon (see "get_icon_size").
    """
    filename = ICONS.get(category, ICONS["default"])
    return "%s/%s_%d.png" % (ICON_URL_PREFIX, filename, get_icon_size(timestamp))
def _format_link(url, tooltip, label):
    """ return an HTML link - or just the label, if no URL is available """
    if url is None:
        return label
    return '<a href="%s" title="%s" target="_blank">%s</a>' % (url, tooltip, label)
def get_gml_from_data(data):
    """ turn the list of events into tab-separated text (one line per place)

    The first line contains the column names. Each following line describes
    one place: its first event supplies position, title and icon, while up
    to MAX_EXTRA_EVENTS additional events at the same place are listed
    within the description.
    Links are replaced by plain text, if an event lacks the respective URL.
    The "place" and "organizer" items are omitted, if they are unknown.

    @param data: list of event dicts (as returned by "filtered_events")
    @return: the text content of the output file (without a final linebreak)
    """
    # first line: the header for gml
    rows = [['point', 'title', 'description', 'icon', 'iconSize', 'iconOffset']]
    # add one line for each place
    for place in group_sorted_events_by_location(data):
        event = place[0]
        event_url = event.get('event_url')
        # the 'description'
        description = ['<ul>']
        description.append("<li>Termin: %s</li>" % get_date_string(event["time"]))
        if 'place' in event:
            description.append('<li>Ort: %s</li>' % _format_link(
                    event.get('place_url'), "Ortsbeschreibung", event['place']))
        if 'organizer' in event:
            description.append('<li>Veranstalter: %s</li>' % _format_link(
                    event.get('organizer_url'), "Veranstalterinfos", event['organizer']))
        if event_url is not None:
            description.append('<li>%s</li>' % _format_link(
                    event_url, event['title'], "... mehr Infos"))
        if len(place) > 1:
            # other events will take place here ...
            description.append('<li>Weitere Veranstaltungen:<ul>')
            # we add not more than MAX_EXTRA_EVENTS more events
            for other_event in place[1:MAX_EXTRA_EVENTS + 1]:
                # show a small icon for each other event
                description.append('<li style="list-style-image:url(%s)">' \
                        % get_icon_url(other_event["category"], None))
                # the date string is pure ASCII (see "html_escape") - thus
                # there is no need for decoding it
                description.append('%s: %s</li>' % (
                        get_date_string(other_event["time"]),
                        _format_link(other_event.get("event_url"), "Details",
                                other_event["title"])))
            description.append('</ul></li>')
        description.append('</ul>')
        size = get_icon_size(event['time'])
        rows.append([
            # the 'point'
            '%(latitude)s,%(longitude)s' % event,
            # the 'title'
            _format_link(event_url, "Details", event['title']),
            # the 'description'
            ''.join(description),
            # the 'icon'
            get_icon_url(event['category'], event['time']),
            # 'iconSize': size of the icon
            '%d,%d' % (size, size),
            # 'iconOffset': offset of the middle of the icon
            '%d,%d' % (0, -size),
        ])
    # turn the array into a string
    return '\n'.join(['\t'.join(row) for row in rows])
def unicode2htmlentities(text):
    """ replace all non-ASCII characters of a string with HTML entities

    A named entity (e.g. "&auml;") is used, if one is defined for the
    character. All other characters are turned into numeric character
    references (e.g. "&#20013;").

    @param text: the string to be converted
    @return: the converted string (containing only ASCII characters)
    """
    result = []
    for char in text:
        codepoint = ord(char)
        if codepoint < 128:
            result.append(char)
        elif codepoint in htmlentitydefs.codepoint2name:
            result.append('&%s;' % htmlentitydefs.codepoint2name[codepoint])
        else:
            # there is no named entity for this character
            result.append('&#%d;' % codepoint)
    return "".join(result)
def write_to_file(output_file, data):
    """ write the data (followed by a linebreak) to the given file

    Non-ASCII characters are replaced by HTML entities before writing (see
    "unicode2htmlentities"). The program exits with status 3 (open failed)
    or 4 (write failed) in case of errors.

    @param output_file: name of the target file (will be overwritten)
    @param data: the text to be written
    """
    try:
        con = open(output_file, 'w')
    except IOError as errmsg:
        sys.stderr.write("Failed to open output file for writing (%s): %s\n" % (output_file, errmsg))
        sys.exit(3)
    try:
        con.write(unicode2htmlentities(data))
        # a final linebreak is necessary - otherwise openlayers ignores the last line
        con.write("\n")
    except IOError as errmsg:
        sys.stderr.write("Failed to write to output file(%s): %s\n" % (output_file, errmsg))
        sys.exit(4)
    finally:
        # close the file even if writing failed
        con.close()
def is_same_location(event_a, event_b):
    """ check if two events are located at the same place

    beware: for now it does not take the name of the place into account.
    It only compares longitude and latitude (in this order).
    """
    return all(event_a[key] == event_b[key] for key in ("longitude", "latitude"))
def group_sorted_events_by_location(events):
    """ convert a list of events into a list of places - each being a list
    of events happening at this location.

    The places are ordered by the first appearance of their location within
    the given list. The events of each place are sorted by time.

    @param events: list of event dicts (with position and 'time')
    @return: list of lists of events
    """
    places = []
    for event in events:
        for place in places:
            if is_same_location(event, place[0]):
                # add it to the existing place
                place.append(event)
                # there is only one place for each location
                break
        else:
            # create a new place (starting with the current event)
            places.append([event])
    # sort the events of each place by time
    for place in places:
        place.sort(key=lambda item: item["time"])
    return places
if __name__ == '__main__':
    # optional arguments: [[HTML_URL] OUTPUT_FILE]
    arguments = sys.argv[1:]
    input_file = arguments[0] if len(arguments) >= 1 else IMPORT_URL
    output_file = arguments[1] if len(arguments) >= 2 else EXPORT_FILE
    # download -> parse and filter -> format -> store
    html_data = read_from_url(input_file)
    data = get_data_from_html(html_data)
    gml_data = get_gml_from_data(data)
    write_to_file(output_file, gml_data)