#!/usr/bin/env python
# -*- coding: iso-8859-15 -*-
#
# Syntax: convert_html2gml [HTML_URL [OUTPUT_FILE]]
#
# reasonable defaults are used, if zero or one arguments are given
#
# BEWARE: you _MUST_ adapt both the input parser ("EventParser") and the output
# formatting ("get_gml_from_data") to your specific implementation
#
# Copyright: 2010 by Lars Kruse
# License: GNU GPL v3 or higher (http://www.gnu.org/licenses/gpl-3.0.txt)
#

"""
Convert the event table of an HTML page into a tab-separated text file
(called "gml" throughout this script) that is read by OpenLayers.

Planned content of the popup of each map marker:
    * current event:
        * name of the event
        * date
        * place ("Frieda 23")
        * "Details" -> link
    * possibly: list of further events at the same place (next days)
        * date and name of the event as a combined link for "more"
"""

import sys
import urllib
import time
import re
import locale

try:
    # module names of Python 2
    import HTMLParser
    import htmlentitydefs
    from urllib import urlopen as _urlopen
except ImportError:
    # the same modules were renamed in Python 3
    import html.parser as HTMLParser
    import html.entities as htmlentitydefs
    from urllib.request import urlopen as _urlopen

try:
    _unichr = unichr
except NameError:
    # Python 3: "chr" handles the full unicode range
    _unichr = chr


IMPORT_URL = "http://stadtgestalten.org/?q=termin-roh"
EXPORT_FILE = "events.gml"
ICON_URL_PREFIX = "http://stadtgestalten.org/event_map/icons/png"
# fallback encoding of the input page (also used for warnings on stderr)
INPUT_ENCODING = 'utf-8'
# encoding of the strings returned by "time.strftime" for the given LOCALE
DATE_ENCODING = 'iso8859-15'
# number of additional events to be displayed for the same location
MAX_EXTRA_EVENTS = 3
# as given to "locale.setlocale"
LOCALE = "de_DE"

# position of each attribute within a table row of the input page
COLUMNS = {
    'title': 0,
    'time': 1,
    'category': 2,
    'place': 3,
    'latitude': 4,
    'longitude': 5,
    'organizer': 6,
}

# mapping of categories (as defined in drupal) to filename prefixes
ICONS = {
    'Sonstiges': 'misc',
    'Party': 'party',
    'Lesung': 'lesung',
    'Demo': 'demo',
    'spontan': 'spontan',
    'Essen': 'essen',
    'Vortrag': 'vortrag',
    'Musik': 'musik',
    'Film': 'film',
    'Kunst': 'kunst',
    'Kinder': 'kinder',
    'Gruppentreffen': 'gruppentreffen',
    'default': 'misc',
}

# mapping of time "importance" (today, tomorrow, ...) to sizes
ICON_SIZES = {
    'tiny': 12,
    'small': 24,
    'medium': 32,
    'big': 40,
}

# characters with a special meaning in HTML and their replacements
# (the numeric form is used for the apostrophe, since "&apos;" is not
# part of HTML 4)
HTML_ESCAPE_TABLE = {
    "&": "&amp;",
    '"': "&quot;",
    "'": "&#39;",
    ">": "&gt;",
    "<": "&lt;",
}

# how to display different events according to their date
# events before "today" and after "future" are ignored
# (all values are offsets in hours relative to the current time)
TIME_OFFSET_HOURS = {
    "today": -8,
    "soon": 1.5 * 24,
    "coming": 4 * 24,
    "future": 30 * 24,
}


def _warn(message):
    """ write a warning to stderr

    Unicode objects of Python 2 are encoded with INPUT_ENCODING first, since
    stderr may refuse non-ascii characters otherwise.
    """
    if not isinstance(message, str):
        message = message.encode(INPUT_ENCODING, "replace")
    sys.stderr.write(message)


def html_escape(s):
    """ replace special HTML characters and all non-ascii characters

    @param s: the text to be escaped
    @return: a unicode string containing only ascii characters; characters
        listed in HTML_ESCAPE_TABLE are replaced by their entities, all
        characters above 127 by numeric character references
    """
    result = []
    for char in s:
        if char in HTML_ESCAPE_TABLE:
            result.append(HTML_ESCAPE_TABLE[char])
        elif ord(char) > 127:
            result.append("&#%d;" % ord(char))
        else:
            result.append(char)
    return u"".join(result)


class EventParser(HTMLParser.HTMLParser, object):
    """ This parser extracts the input from the event page generated by
    drupal.

    BEWARE: the drupal page MUST be configured as a table layout with the
    above order of columns (see 'COLUMNS')

    After feeding the page, "events" contains one dictionary for each
    non-empty table row found inside of the '<div class="view-content">'.
    """

    def __init__(self):
        super(EventParser, self).__init__()
        # are we inside of the "view-content" div?
        self.in_data = False
        # nesting level of divs below the "view-content" div
        self.div_view_depth = 0
        # are we inside of a table row?
        self.in_event = False
        # are we inside of a table cell?
        self.in_attribute = False
        # index of the current table cell within its row (see 'COLUMNS')
        self.current_attribute = -1
        self.events = []

    def handle_starttag(self, tag, attrs):
        """ track the position within the page and collect link targets """
        attrs_dict = dict(attrs)
        if tag == "div":
            if attrs_dict.get("class") == "view-content":
                self.in_data = True
                self.div_view_depth = 0
            elif self.in_data:
                # count the levels of sub-divs
                self.div_view_depth += 1
        elif self.in_data and (tag == "tr"):
            # a new event item
            self.in_event = True
            self.current_attribute = -1
            self.events.append({})
        elif self.in_event and (tag == "td"):
            self.in_attribute = True
            self.current_attribute += 1
        elif self.in_attribute and (tag == "a"):
            href = attrs_dict.get("href")
            if (href is None) or not self.events:
                # an anchor without a target (e.g. <a name="...">)
                return
            event = self.events[-1]
            if self.current_attribute == COLUMNS['title']:
                event['event_url'] = href
            elif self.current_attribute == COLUMNS['place']:
                event['place_url'] = href
            elif self.current_attribute == COLUMNS['organizer']:
                event['organizer_url'] = href

    def handle_endtag(self, tag):
        """ track the end of divs, table rows and table cells """
        if not self.in_data:
            return
        if tag == "div":
            if self.div_view_depth > 0:
                self.div_view_depth -= 1
            else:
                # this closes the "view-content" div itself: everything
                # after it is not part of the event table anymore
                self.in_data = False
                self.in_event = False
                self.in_attribute = False
        elif tag == "tr":
            self.in_event = False
            # remove empty entries (e.g. the header row of the table)
            if self.events and not self.events[-1]:
                del self.events[-1]
        elif tag == "td":
            self.in_attribute = False

    def handle_data(self, data):
        """ store the text of the current table cell in the current event

        BEWARE: depending on the parser, "handle_data" is called separately
        for the text before and after an ampersand entity (e.g. &amp;).
        Thus the strings, that may contain ampersand entities/refs (title,
        place, organizer), are joined from all pieces instead of being
        overwritten. Example: "foo &amp; bar" arrives as "foo ", "&" and
        " bar". See "handle_charref" and "handle_entityref" below.
        """
        if not self.in_attribute or not self.events:
            return
        if not data.strip():
            # ignore empty strings
            return
        event = self.events[-1]
        column = self.current_attribute
        if column == COLUMNS['title']:
            # maybe the title is split by an ampersand entity
            event['title'] = event.get('title', '') + html_escape(data)
        elif column == COLUMNS['time']:
            # the first "time" field is the start - the next ones are ignored
            if 'time' not in event:
                # surrounding whitespace would hide the leading digits
                timestamp = re.match("[0-9]*", data.strip()).group()
                if timestamp:
                    try:
                        event['time'] = int(timestamp)
                    except ValueError:
                        # give a warning - the event will be ignored during
                        # the check later
                        sys.stderr.write("Invalid time format: %s\n"
                                         % (timestamp, ))
                # an empty string is ignored - an event without "time" is
                # dropped during the check later
        elif column == COLUMNS['category']:
            event['category'] = data.strip()
        elif column == COLUMNS['place']:
            event['place'] = event.get('place', '') + html_escape(data)
        elif column == COLUMNS['latitude']:
            event['latitude'] = data.strip()
        elif column == COLUMNS['longitude']:
            event['longitude'] = data.strip()
        elif column == COLUMNS['organizer']:
            event['organizer'] = (event.get('organizer', '')
                                  + html_escape(data))
        else:
            _warn(u"UNKNOWN ATTRIBUTE: %d (%s)\n" % (column, data))

    def handle_charref(self, num):
        """ add the encoded character (e.g. &#228; or &#xe4;) to the current
        string

        References that cannot be decoded are kept as literal text.
        """
        try:
            if num[:1] in ("x", "X"):
                char = _unichr(int(num[1:], 16))
            else:
                char = _unichr(int(num))
        except (ValueError, OverflowError):
            char = u"&#%s;" % num
        # decode before passing it to "handle_data"
        self.handle_data(char)

    def handle_entityref(self, name):
        """ add the encoded entity (e.g. &amp;) to the current string

        Unknown entity names are kept as literal text.
        """
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            self.handle_data(u"&%s;" % name)
        else:
            # decode before passing it to "handle_data"
            self.handle_data(_unichr(codepoint))


def read_from_url(url):
    """ download a page and return its content as a unicode string

    The charset announced by the server is used for decoding. INPUT_ENCODING
    is the fallback for a missing or unknown charset.
    The script exits with code 1 (open failed) or 2 (read failed).
    """
    try:
        con = _urlopen(url)
    except (IOError, ValueError) as errmsg:
        sys.stderr.write("Failed to open input (%s): %s\n" % (url, errmsg))
        sys.exit(1)
    try:
        data = con.read()
        headers = con.headers
        if hasattr(headers, "get_content_charset"):
            # Python 3
            encoding = headers.get_content_charset()
        else:
            # Python 2
            encoding = headers.getparam('charset')
    except IOError as errmsg:
        sys.stderr.write("Failed to read from input (%s): %s\n"
                         % (url, errmsg))
        sys.exit(2)
    finally:
        con.close()
    try:
        return data.decode(encoding or INPUT_ENCODING)
    except LookupError:
        # the server announced a charset that is unknown to python
        return data.decode(INPUT_ENCODING)


def get_date_string(timestamp):
    """ return the html-escaped date and time of a timestamp (seconds since
    the epoch) formatted according to LOCALE

    A warning is written to stderr if LOCALE is not available. The current
    locale is used in this case.
    """
    try:
        locale.setlocale(locale.LC_ALL, LOCALE)
    except locale.Error as err_msg:
        sys.stderr.write("Locales (%s) not found: %s\n" % (LOCALE, err_msg)
                + " Maybe you should run 'aptitude install locales-all' on debian.\n")
    localtime = time.localtime(timestamp)
    formatted = time.strftime(locale.nl_langinfo(locale.D_T_FMT), localtime)
    if isinstance(formatted, bytes):
        # Python 2 returns a byte string in the encoding of the locale
        formatted = formatted.decode(DATE_ENCODING, "replace")
    return html_escape(formatted)


def get_data_from_html(html):
    """ parse the html input and return the list of valid events
    (see "filtered_events")
    """
    parser = EventParser()
    parser.feed(html)
    return filtered_events(parser.events)


def filtered_events(events):
    """ remove old or incomplete events

    An event needs a title, a position (longitude and latitude) and a time
    within the displayed period (see "is_event_current").
    Side effect: the key 'category' is added (as None) to events lacking it.
    """
    result = []
    for event in events:
        # add an empty 'category', if it is not set
        event.setdefault('category', None)
        if 'title' not in event:
            sys.stderr.write("Skipping event without a title\n")
        elif not (('longitude' in event) and ('latitude' in event)):
            # no error messages -> too many mails sent ...
            pass
        elif 'time' not in event:
            _warn(u"Skipping event without date/time: %(title)s\n" % event)
        elif not is_event_current(event["time"]):
            # skipping events, that are too old or too far in the future
            pass
        else:
            # the event is valid
            result.append(event)
    return result


def get_icon_size(timestamp):
    """ icon sizes depend on the current date and the date of the event

    Use the timestamp "None" for the smallest icon.
    """
    now = time.time()
    if timestamp is None:
        return ICON_SIZES["tiny"]
    if timestamp >= now + 3600 * TIME_OFFSET_HOURS["coming"]:
        return ICON_SIZES["small"]
    elif timestamp >= now + 3600 * TIME_OFFSET_HOURS["soon"]:
        return ICON_SIZES["medium"]
    else:
        return ICON_SIZES["big"]


def is_event_current(timestamp):
    """ check if the timestamp is within the displayed period, i.e. not
    before "today" and not after "future" (see TIME_OFFSET_HOURS)
    """
    now = time.time()
    earliest = now + 3600 * TIME_OFFSET_HOURS["today"]
    latest = now + 3600 * TIME_OFFSET_HOURS["future"]
    return earliest <= timestamp < latest


def get_icon_url(category, timestamp):
    """ return the URL of the icon for a category (unknown categories use
    the "default" icon) in the size suitable for the timestamp
    """
    filename = ICONS.get(category, ICONS["default"])
    size = get_icon_size(timestamp)
    return "%s/%s_%d.png" % (ICON_URL_PREFIX, filename, size)


def get_gml_from_data(data):
    """ turn a list of events into the tab-separated output text

    The first line is the header. It is followed by one line for each
    location, describing the earliest event at this place.
    """
    # first line: the header for gml
    result = [['point', 'title', 'description', 'icon', 'iconSize',
               'iconOffset']]
    # add one line for each place
    for place in group_sorted_events_by_location(data):
        event = place[0]
        size = get_icon_size(event['time'])
        # the 'description' is empty for now
        description = ''
        result.append([
            # the 'point'
            '%(latitude)s,%(longitude)s' % event,
            # the 'title'
            '%(title)s' % event,
            description,
            # the 'icon'
            get_icon_url(event['category'], event['time']),
            # 'iconSize': size of the icon
            '%d,%d' % (size, size),
            # 'iconOffset': offset of the middle of the icon
            '%d,%d' % (0, -size),
        ])
    # turn the array into a string
    return '\n'.join(['\t'.join(items) for items in result])


def unicode2htmlentities(text):
    """ replace all non-ascii characters with html entities

    Named entities are preferred. Characters without a named entity are
    turned into numeric character references.
    """
    result = []
    for char in text:
        codepoint = ord(char)
        if codepoint < 128:
            result.append(char)
        elif codepoint in htmlentitydefs.codepoint2name:
            result.append('&%s;' % htmlentitydefs.codepoint2name[codepoint])
        else:
            result.append('&#%d;' % codepoint)
    return "".join(result)


def write_to_file(output_file, data):
    """ write the text (with non-ascii characters replaced by entities)
    and a final linebreak to the output file

    The script exits with code 3 (open failed) or 4 (write failed).
    """
    try:
        con = open(output_file, 'w')
    except IOError as errmsg:
        sys.stderr.write("Failed to open output file for writing (%s): %s" % (output_file, errmsg))
        sys.exit(3)
    try:
        con.write(unicode2htmlentities(data))
        # a final linebreak is necessary - otherwise openlayers ignores the
        # last line
        con.write("\n")
    except IOError as errmsg:
        sys.stderr.write("Failed to write to output file(%s): %s" % (output_file, errmsg))
        sys.exit(4)
    finally:
        con.close()


def is_same_location(event_a, event_b):
    """ check if two events are located at the same place

    beware: for now it does not take the name of the place into account.
    It only compares latitude and longitude.
    """
    return (event_a["longitude"] == event_b["longitude"]) \
            and (event_a["latitude"] == event_b["latitude"])


def group_sorted_events_by_location(events):
    """ convert a list of events into a list of places - each being a list
    of events happening at this location.

    Two events share a place, if their longitude and latitude are equal
    (the same rule as "is_same_location"). The places are returned in the
    order of their first appearance. The events of each place are sorted
    by time.
    """
    places = []
    place_by_position = {}
    for event in events:
        position = (event["longitude"], event["latitude"])
        place = place_by_position.get(position)
        if place is None:
            # create a new place (starting with the current event)
            place = []
            place_by_position[position] = place
            places.append(place)
        place.append(event)
    # sort the events of each place by time
    for place in places:
        place.sort(key=lambda event: event["time"])
    return places


def main(argv=None):
    """ read the input page and write the output file

    @param argv: the list of arguments (default: sys.argv); the first
        argument is the input URL, the second one is the output file
    """
    if argv is None:
        argv = sys.argv
    input_file = argv[1] if len(argv) > 1 else IMPORT_URL
    output_file = argv[2] if len(argv) > 2 else EXPORT_FILE
    html_data = read_from_url(input_file)
    data = get_data_from_html(html_data)
    gml_data = get_gml_from_data(data)
    write_to_file(output_file, gml_data)


if __name__ == '__main__':
    main()