From 38b7268e27bef9d41ddd74d5bcf98dd717446fa7 Mon Sep 17 00:00:00 2001 From: lars Date: Tue, 8 Mar 2011 22:01:10 +0000 Subject: [PATCH] encoding issues fixed fewer warnings --- .../scripts/convert_raw2gml.py | 38 +++++++++++++++---- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/osm/dynamic_markers/scripts/convert_raw2gml.py b/osm/dynamic_markers/scripts/convert_raw2gml.py index c4f05fe..fa7e014 100755 --- a/osm/dynamic_markers/scripts/convert_raw2gml.py +++ b/osm/dynamic_markers/scripts/convert_raw2gml.py @@ -32,10 +32,12 @@ import time import re import locale + IMPORT_URL = "http://stadtgestalten.org/?q=termin-roh" EXPORT_FILE = "events.gml" ICON_URL_PREFIX = "http://stadtgestalten.org/event_map/icons/png" INPUT_ENCODING = 'utf-8' +DATE_ENCODING = 'iso8859-15' # number of additional events to be displayed for the same location MAX_EXTRA_EVENTS = 3 @@ -102,6 +104,23 @@ TIME_OFFSET_HOURS = { } +def htmlentitydecode(s): + # convert html entities to unicode (taken from: http://wiki.python.org/moin/EscapingHtml) + return re.sub('&(%s);' % '|'.join(htmlentitydefs.name2codepoint), + lambda m: unichr(htmlentitydefs.name2codepoint[m.group(1)]), s) + + +def htmlentityencode(s): + # convert unicode to html entities (taken from: http://wiki.python.org/moin/EscapingHtml) + result = [] + for char in s: + if ord(char) > 127: + result.append("&#%d;" % ord(char)) + else: + result.append(char) + return u"".join(result) + + class EventParser(HTMLParser.HTMLParser, object): """ This parser extracts the input from the the event page generated by drupal. 
BEWARE: the drupal page MUST be configured as a table layout with the above order @@ -183,7 +202,7 @@ class EventParser(HTMLParser.HTMLParser, object): pass elif self.current_attribute == COLUMNS['title']: # maybe the title is splitted by an ampersand entity - event['title'] = event.get('title', '') + data + event['title'] = event.get('title', '') + htmlentityencode(data) elif self.current_attribute == COLUMNS['time']: if event.has_key("time"): # the first "time" field is the start @@ -204,13 +223,13 @@ class EventParser(HTMLParser.HTMLParser, object): elif self.current_attribute == COLUMNS['category']: event['category'] = data.strip() elif self.current_attribute == COLUMNS['place']: - event['place'] = event.get('place', '') + data + event['place'] = event.get('place', '') + htmlentityencode(data) elif self.current_attribute == COLUMNS['latitude']: event['latitude'] = data.strip() elif self.current_attribute == COLUMNS['longitude']: event['longitude'] = data.strip() elif self.current_attribute == COLUMNS['organizer']: - event['organizer'] = event.get('organizer', '') + data + event['organizer'] = event.get('organizer', '') + htmlentityencode(data) else: sys.stderr.write("UNKNOWN ATTRIBUTE: %d (%s)\n" % (self.current_attribute, data.encode(INPUT_ENCODING))) @@ -249,7 +268,7 @@ def get_date_string(timestamp): sys.stderr.write("Locales (%s) not found: %s\n" % (LOCALE, err_msg) \ + " Maybe you should run 'aptitude install locales-all' on debian.\n") localtime = time.localtime(timestamp) - return time.strftime(locale.nl_langinfo(locale.D_T_FMT), localtime) + return htmlentityencode(time.strftime(locale.nl_langinfo(locale.D_T_FMT), localtime)) def html_escape(text): @@ -277,8 +296,10 @@ def filtered_events(events): if not event.has_key('title'): sys.stderr.write("Skipping event without a title\n"); elif not (event.has_key('longitude') and event.has_key('latitude')): - sys.stderr.write(("Skipping event without long/lat: %(title)s\n" \ - % event).encode(INPUT_ENCODING)) + 
# no error messages -> too many mails sent ... + #sys.stderr.write(("Skipping event without long/lat: %(title)s\n" \ + # % event).encode(INPUT_ENCODING)) + pass elif not event.has_key('time'): sys.stderr.write(("Skipping event without date/time: %(title)s\n" \ % event).encode(INPUT_ENCODING)) @@ -360,7 +381,8 @@ def get_gml_from_data(data): description += '
  • ' \ % get_icon_url(other_event["category"], None) description += '%s: %s
  • ' \ - % (get_date_string(other_event["time"]), other_event["event_url"], html_escape(other_event["title"])) + % (html_escape(get_date_string(other_event["time"]).decode(DATE_ENCODING)), + other_event["event_url"], html_escape(other_event["title"])) description += '' description += '' items.append(description) @@ -371,7 +393,7 @@ def get_gml_from_data(data): # size of the icon items.append('%d,%d' % (size, size)) # offset of the middle of the icon - items.append('%d,%d' % (-size/2, -size/2)) + items.append('%d,%d' % (0, -size)) # turn the array into a string return '\n'.join(['\t'.join(event) for event in result])