#!/usr/bin/env python
# -*- coding: iso-8859-15 -*-
#
# Syntax: convert_html2gml [[HTML_URL] OUTPUT_FILE]
#
# reasonable defaults are used if zero or one arguments are given
#
# BEWARE: you _MUST_ adapt both the input parser ("EventParser") and the output
# formatting ("get_gml_from_data") to your specific implementation
#
# Copyright: 2010 by Lars Kruse <devel@sumpfralle.de>
# License: GNU GPL v3 or higher (http://www.gnu.org/licenses/gpl-3.0.txt)
#
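# Example invocation (illustrative; the URL and file name below are simply the
# defaults IMPORT_URL and EXPORT_FILE defined further down):
#
#   ./convert_html2gml "http://stadtgestalten.org/?q=termin-roh" events.gml
#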
"""
Popup:
 * current event:
   * event name
   * date/time
   * location ("Frieda 23")
   * "Details" -> link
 * possibly: list of further events at the same location (next few days)
   * date and event name as a combined link for "more"
"""

import sys
import HTMLParser
import urllib
import htmlentitydefs
import time
import re
import locale


IMPORT_URL = "http://stadtgestalten.org/?q=termin-roh"
EXPORT_FILE = "events.gml"
ICON_URL_PREFIX = "http://stadtgestalten.org/event_map/icons/png"
INPUT_ENCODING = 'utf-8'
DATE_ENCODING = 'iso8859-15'
# number of additional events to be displayed for the same location
MAX_EXTRA_EVENTS = 3

# as given to "locale.setlocale"
LOCALE = "de_DE"

COLUMNS = {
    'title': 0,
    'time': 1,
    'category': 2,
    'place': 3,
    'latitude': 4,
    'longitude': 5,
    'organizer': 6,
}

# mapping of categories (as defined in drupal) to filename prefixes
ICONS = {
    'Sonstiges': 'misc',
    'Party': 'party',
    'Lesung': 'lesung',
    'Demo': 'demo',
    'spontan': 'spontan',
    'Essen': 'essen',
    'Vortrag': 'vortrag',
    'Musik': 'musik',
    'Film': 'film',
    'Kunst': 'kunst',
    'Kinder': 'kinder',
    'Gruppentreffen': 'gruppentreffen',
    'default': 'misc',
}

# mapping of time "importance" (today, tomorrow, ...) to sizes
ICON_SIZES = {
    'tiny': 12,
    'small': 24,
    'medium': 32,
    'big': 40,
}

HTML_ESCAPE_TABLE = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
    ">": "&gt;",
    "<": "&lt;",
}

""" how to display different events according to their date
events before "today" and after "future" are ignored
"""
TIME_OFFSET_HOURS = {
    "today": -8,
    "soon": 1.5 * 24,
    "coming": 4 * 24,
    "future": 30 * 24,
}
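# Illustration (based on "is_event_current" and "get_icon_size" below):
# with "today" = -8 an event that started up to eight hours ago still counts
# as current, while anything more than 30 days ("future") ahead is dropped;
# "soon" and "coming" only influence the icon size of upcoming events.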


def html_escape(s):
    result = []
    for char in s:
        if char in HTML_ESCAPE_TABLE:
            result.append(HTML_ESCAPE_TABLE[char])
        elif ord(char) > 127:
            result.append("&#%d;" % ord(char))
        else:
            result.append(char)
    return u"".join(result)
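# Illustrative example (hypothetical input): html_escape(u'Kunst & K\xe4se')
# returns u'Kunst &amp; K&#228;se' - the ASCII specials become named entities,
# everything above codepoint 127 becomes a numeric character reference.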


class EventParser(HTMLParser.HTMLParser, object):
    """ This parser extracts the input from the event page generated by drupal.
    BEWARE: the drupal page MUST be configured as a table layout with the above order
    of columns (see 'COLUMNS')
    """

    def __init__(self):
        super(EventParser, self).__init__()
        self.in_data = False
        self.div_view_depth = 0
        self.in_event = False
        self.in_attribute = False
        self.current_attribute = -1
        self.events = []

    def handle_starttag(self, tag, attrs):
        attrs_dict = dict(attrs)
        if (tag == "div"):
            if attrs_dict.has_key("class") and (attrs_dict["class"] == "view-content"):
                self.in_data = True
                self.div_view_depth = 0
            elif self.in_data:
                # count the levels of sub-divs
                self.div_view_depth += 1
        elif self.in_data and (tag == "tr"):
            # a new event item
            self.in_event = True
            self.current_attribute = -1
            self.events.append({})
        elif self.in_event and (tag == "td"):
            self.in_attribute = True
            self.current_attribute += 1
        elif self.in_attribute and (tag == "a"):
            event = self.events[-1]
            if self.current_attribute == COLUMNS['title']:
                event['event_url'] = attrs_dict["href"]
            elif self.current_attribute == COLUMNS['place']:
                event['place_url'] = attrs_dict["href"]
            elif self.current_attribute == COLUMNS['organizer']:
                event['organizer_url'] = attrs_dict["href"]

    def handle_endtag(self, tag):
        if self.in_data:
            if (tag == "div"):
                if self.div_view_depth > 0:
                    self.div_view_depth -= 1
                elif self.div_view_depth == 0:
                    self.in_event = False
                else:
                    # div_view_depth never goes below zero
                    pass
            elif (tag == "tr"):
                self.in_event = False
                # remove empty entries
                if not self.events[-1]:
                    del self.events[-1]
            elif (tag == "td"):
                self.in_attribute = False
            else:
                pass

    def handle_data(self, data):
        """ BEWARE: the "handle_data" function does not really work well
        for strings that contain an ampersand entity (e.g. "&amp;").
        In this case, it splits the string before and after the special
        character. This results (e.g.) in a split title string, and
        thus only the last part of it would be stored.
        Example: "foo &amp; bar" -> "bar"
        Solution: the strings that may contain ampersand entities/refs
        (title, place, organizer) are joined with potential special
        characters.
        See "handle_charref" and "handle_entityref" below.
        """
        if self.in_attribute:
            data = data.decode(INPUT_ENCODING)
            event = self.events[-1]
            if not data.strip():
                # ignore empty strings
                pass
            elif self.current_attribute == COLUMNS['title']:
                # maybe the title is split by an ampersand entity
                event['title'] = event.get('title', '') + html_escape(data)
            elif self.current_attribute == COLUMNS['time']:
                if event.has_key("time"):
                    # the first "time" field is the start
                    # the next ones should be ignored
                    pass
                else:
                    timestamp = re.match("[0-9]*", data).group().strip()
                    if timestamp:
                        # non-empty string
                        try:
                            event['time'] = int(timestamp)
                        except ValueError:
                            # give a warning - the event will be ignored during the check later
                            sys.stderr.write("Invalid time format: %s\n" % (timestamp, ))
                    else:
                        # empty string - this event will be ignored during the check later
                        pass
            elif self.current_attribute == COLUMNS['category']:
                event['category'] = data.strip()
            elif self.current_attribute == COLUMNS['place']:
                event['place'] = event.get('place', '') + html_escape(data)
            elif self.current_attribute == COLUMNS['latitude']:
                event['latitude'] = data.strip()
            elif self.current_attribute == COLUMNS['longitude']:
                event['longitude'] = data.strip()
            elif self.current_attribute == COLUMNS['organizer']:
                event['organizer'] = event.get('organizer', '') + html_escape(data)
            else:
                sys.stderr.write("UNKNOWN ATTRIBUTE: %d (%s)\n" % (self.current_attribute, data.encode(INPUT_ENCODING)))

    def handle_charref(self, num):
        """ add the encoded character (a numeric reference, e.g. "&#228;") to the current string """
        # decode before passing it to "handle_data"
        self.handle_data(unichr(int(num)))

    def handle_entityref(self, name):
        """ add the encoded entity (e.g. "&amp;") to the current string """
        # decode before passing it to "handle_data"
        self.handle_data(unichr(htmlentitydefs.name2codepoint[name]))
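    # Example of the split/join behaviour described in "handle_data"
    # (hypothetical cell content): for "<td>Kunst &amp; Kultur</td>" the parser
    # calls handle_data("Kunst "), handle_entityref("amp") and
    # handle_data(" Kultur"), so the title is concatenated piece by piece.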


def read_from_url(url):
    try:
        con = urllib.urlopen(url)
    except IOError, errmsg:
        sys.stderr.write("Failed to open input (%s): %s\n" % (url, errmsg))
        sys.exit(1)
    try:
        data = con.read()
    except IOError, errmsg:
        sys.stderr.write("Failed to read from input (%s): %s\n" % (url, errmsg))
        sys.exit(2)
    con.close()
    return data


def get_date_string(timestamp):
    # use locale encoding
    try:
        locale.setlocale(locale.LC_ALL, LOCALE)
    except locale.Error, err_msg:
        sys.stderr.write("Locales (%s) not found: %s\n" % (LOCALE, err_msg) \
                + " Maybe you should run 'aptitude install locales-all' on debian.\n")
    localtime = time.localtime(timestamp)
    return html_escape(time.strftime(locale.nl_langinfo(locale.D_T_FMT), localtime))
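# Note: the exact output depends on the locale's D_T_FMT; with a glibc "de_DE"
# locale it typically looks like "Sa 06 Feb 2010 20:00:00 CET" (non-ASCII month
# abbreviations such as "Mär" are additionally escaped by html_escape).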


def get_data_from_html(html):
    parser = EventParser()
    # parse the html input
    parser.feed(html)
    return filtered_events(parser.events)


def filtered_events(events):
    """ remove old or incomplete events """
    result = []
    for event in events:
        # add an empty 'category', if it is not set
        if not event.has_key('category'):
            event['category'] = None
        if not event.has_key('title'):
            sys.stderr.write("Skipping event without a title\n")
        elif not (event.has_key('longitude') and event.has_key('latitude')):
            # no error messages -> too many mails sent ...
            #sys.stderr.write(("Skipping event without long/lat: %(title)s\n" \
            #        % event).encode(INPUT_ENCODING))
            pass
        elif not event.has_key('time'):
            sys.stderr.write(("Skipping event without date/time: %(title)s\n" \
                    % event).encode(INPUT_ENCODING))
        elif not is_event_current(event["time"]):
            # skip events that are too old or too far in the future
            pass
        else:
            # the event is valid
            result.append(event)
    return result


# icon sizes depend on the current date and the date of the event
# use the timestamp "None" for the smallest icon
def get_icon_size(timestamp):
    now = time.time()
    if timestamp is None:
        return ICON_SIZES["tiny"]
    if timestamp >= now + 3600 * TIME_OFFSET_HOURS["coming"]:
        return ICON_SIZES["small"]
    elif timestamp >= now + 3600 * TIME_OFFSET_HOURS["soon"]:
        return ICON_SIZES["medium"]
    else:
        return ICON_SIZES["big"]


def is_event_current(timestamp):
    now = time.time()
    if timestamp < now + 3600 * TIME_OFFSET_HOURS["today"]:
        return False
    elif timestamp >= now + 3600 * TIME_OFFSET_HOURS["future"]:
        return False
    else:
        return True


def get_icon_url(category, timestamp):
    if category in ICONS.keys():
        filename = ICONS[category]
    else:
        filename = ICONS["default"]
    size = get_icon_size(timestamp)
    return "%s/%s_%d.png" % (ICON_URL_PREFIX, filename, size)


def get_gml_from_data(data):
    result = []
    # first line: the header for gml
    result.append(['point', 'title', 'description', 'icon', 'iconSize', 'iconOffset'])
    # add one line for each event
    for place in group_sorted_events_by_location(data):
        result.append([])
        event = place[0]
        items = result[-1]
        # the 'point'
        items.append('%(latitude)s,%(longitude)s' % event)
        # the 'title'
        items.append('<a href="%(event_url)s" title="Details" target="_blank">%(title)s</a>' % event)
        # the 'description'
        description = '<ul>'
        # maybe only a date without time is given
        description += "<li>Termin: %s</li>" % get_date_string(event["time"])
        description += '<li>Ort: <a href="%(place_url)s" title="Ortsbeschreibung" target="_blank">%(place)s</a></li>' % event
        if event.has_key('organizer'):
            description += '<li>Veranstalter: <a href="%(organizer_url)s" title="Veranstalterinfos" target="_blank">%(organizer)s</a></li>' % event
        description += '<li><a href="%(event_url)s" title="%(title)s" target="_blank">... mehr Infos</a></li>' % event
        if len(place) > 1:
            # other events will take place here ...
            description += '<li>Weitere Veranstaltungen:<ul>'
            # add not more than MAX_EXTRA_EVENTS additional events
            num_events = min(MAX_EXTRA_EVENTS, len(place)-1)
            for index in range(1, num_events+1):
                other_event = place[index]
                # show a small icon for each other event
                description += '<li style="list-style-image:url(%s)">' \
                        % get_icon_url(other_event["category"], None)
                description += '%s: <a href="%s" title="Details" target="_blank">%s</a></li>' \
                        % (get_date_string(other_event["time"]).decode(DATE_ENCODING),
                        other_event["event_url"], other_event["title"])
            description += '</ul></li>'
        description += '</ul>'
        items.append(description)
        # the 'icon'
        items.append(get_icon_url(event['category'], event['time']))
        # 'iconSize' and 'iconOffset'
        size = get_icon_size(event['time'])
        # size of the icon
        items.append('%d,%d' % (size, size))
        # offset of the middle of the icon
        items.append('%d,%d' % (0, -size))
    # turn the array into a string
    return '\n'.join(['\t'.join(event) for event in result])
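# Despite the ".gml" default file name, the generated output is a plain
# tab-separated text table: one header line plus one line per location, e.g.
# (values illustrative, <TAB> stands for a literal tab character):
#   point<TAB>title<TAB>description<TAB>icon<TAB>iconSize<TAB>iconOffset
#   54.09,12.13<TAB><a href="...">...</a><TAB><ul>...</ul><TAB>.../film_40.png<TAB>40,40<TAB>0,-40
# which matches the tab-separated format read by the OpenLayers text layer.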


def unicode2htmlentities(text):
    result = ""
    for c in text:
        if ord(c) < 128:
            result += c
        else:
            result += '&%s;' % htmlentitydefs.codepoint2name[ord(c)]
    return result
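# Example: unicode2htmlentities(u'Caf\xe9') -> 'Caf&eacute;'
# (note: codepoints without a named entity would raise a KeyError here)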


def write_to_file(output_file, data):
    try:
        con = open(output_file, 'w')
    except IOError, errmsg:
        sys.stderr.write("Failed to open output file for writing (%s): %s\n" % (output_file, errmsg))
        sys.exit(3)
    try:
        con.write(unicode2htmlentities(data))
        # a final linebreak is necessary - otherwise openlayers ignores the last line
        con.write("\n")
    except IOError, errmsg:
        sys.stderr.write("Failed to write to output file (%s): %s\n" % (output_file, errmsg))
        sys.exit(4)
    con.close()


def is_same_location(event_a, event_b):
    """ check if two events are located at the same place
    beware: for now it does not take the name of the place into account.
    It only compares latitude and longitude.
    """
    if (event_a["longitude"] == event_b["longitude"]) \
            and (event_a["latitude"] == event_b["latitude"]):
        return True
    else:
        return False


def group_sorted_events_by_location(events):
    """ convert a list of events into a list of places - each being a list
    of events happening at this location.
    """
    places = []
    for event in events:
        # we use this variable to store a (possibly) matching place
        already_defined_place = None
        for place in places:
            if is_same_location(event, place[0]):
                already_defined_place = place
        if already_defined_place is None:
            # create a new place (starting with the current event)
            places.append([event])
        else:
            # add it to the existing place
            already_defined_place.append(event)
    # sort the events of each place by time
    def cmp_event(x, y):
        if x["time"] < y["time"]:
            return -1
        elif x["time"] == y["time"]:
            return 0
        else:
            return 1
    for place in places:
        place.sort(cmp=cmp_event)
    return places
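# Resulting structure (illustrative): [[event_a1, event_a2], [event_b1]] -
# one inner list per distinct latitude/longitude pair, each sorted by "time".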


if __name__ == '__main__':
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    else:
        input_file = IMPORT_URL
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    else:
        output_file = EXPORT_FILE
    html_data = read_from_url(input_file)
    data = get_data_from_html(html_data)
    gml_data = get_gml_from_data(data)
    write_to_file(output_file, gml_data)