#!/usr/bin/env python
import string
import httplib, urllib
class ReadJoerchs:
    """Read film entries out of the table rows of an HTML page.

    The page at ``url`` is fetched and scanned for table rows.  Every row
    whose number of cells equals the number of column names in ``tags[0]``
    is appended to ``tags`` as a list of cleaned-up cell strings.

    NOTE(review): the markup literals used by this class (the row/cell tags
    below and the HTML entities in ``_REPLACEMENTS``) were reconstructed from
    the surrounding comments -- confirm them against the page that is
    actually parsed and adjust the class attributes if they differ.
    """

    # Delimiters of one table row and of one cell inside a row.
    ROW_OPEN = '<tr>'
    ROW_CLOSE = '</tr>'
    CELL_OPEN = '<td>'
    CELL_CLOSE = '</td>'

    # (needle, replacement) pairs applied in order by replacestuff().
    # The raw umlauts are written as latin-1 escapes so the file stays pure
    # ASCII; this matches the latin-1 decoding done in _fetch().
    _REPLACEMENTS = (
        ('&nbsp;', ' '),
        ('&quot;', '"'),
        ('&uuml;', 'ue'),
        ('&ouml;', 'oe'),
        ('&auml;', 'ae'),
        ('\xf6', 'oe'),
        ('\xe4', 'ae'),
        ('\xfc', 'ue'),
    )

    def __init__(self, url, tags):
        """Remember the page location and the result list.

        url  -- location of the HTML page (anything the urllib opener
                accepts, e.g. a ``file://`` URL).
        tags -- list whose first element is the list of expected column
                names; parsed rows are appended to this very list.
        """
        self.url = url
        self.films = tags

    def getFilmEntries(self, printfilms=None):
        """Return a 2D list with film data.

        Fetches ``self.url``, walks over all table rows and appends each row
        that has exactly ``len(self.films[0])`` cells to ``self.films``.
        Rows with more or fewer cells are skipped.

        printfilms -- accepted for backward compatibility; it is not used.

        Returns ``self.films`` (the list passed to the constructor, mutated
        in place), so calling this method twice appends the rows twice.
        Raises IndexError when ``self.films`` has no header row.
        """
        page = self._fetch()
        expected = len(self.films[0])
        pos = 0
        while True:
            # Stop as soon as the page has no further row instead of
            # looping a fixed number of times.
            row_start = page.find(self.ROW_OPEN, pos)
            if row_start == -1:
                break
            row_start += len(self.ROW_OPEN)
            row_end = page.find(self.ROW_CLOSE, row_start)
            if row_end == -1:
                # Unterminated last row: treat the rest of the page as row.
                row_end = len(page)
            cells = self._parse_cells(page[row_start:row_end])
            if len(cells) == expected:
                self.films.append(cells)
            # row_end > pos always holds, so the scan is guaranteed to end.
            pos = row_end
        return self.films

    def _fetch(self):
        """Download ``self.url`` without proxies and return its text."""
        try:
            from urllib.request import ProxyHandler, build_opener
        except ImportError:
            # Python 2: same opener the original code used.
            import urllib
            conn = urllib.FancyURLopener({}).open(self.url)
        else:
            conn = build_opener(ProxyHandler({})).open(self.url)
        try:
            data = conn.read()
        finally:
            conn.close()
        if not isinstance(data, str):
            # Python 3 hands back bytes.  latin-1 maps every byte to one
            # character, so decoding can never fail.
            # NOTE(review): assumes the page is latin-1 encoded -- confirm.
            data = data.decode('latin-1')
        return data

    def _parse_cells(self, row):
        """Return the cleaned-up text of every cell found in ``row``."""
        cells = []
        pos = 0
        while True:
            start = row.find(self.CELL_OPEN, pos)
            if start == -1:
                # No further cell in this row.
                break
            start += len(self.CELL_OPEN)
            end = row.find(self.CELL_CLOSE, start)
            if end == -1:
                end = len(row)
            text = self.replacestuff(row[start:end])
            # A cell that is empty after the clean-up is stored as a single
            # blank so that every column keeps a non-empty value.
            cells.append(text or " ")
            pos = end
        return cells

    def replacestuff(self, data):
        """Replace HTML entities and German umlauts in ``data``.

        ``&nbsp;`` becomes a blank, ``&quot;`` a double quote, and the
        umlauts (as entities or as raw latin-1 characters) become their
        two-letter ASCII transcriptions.  Returns the new string.
        """
        newstring = data
        for needle, replacement in self._REPLACEMENTS:
            newstring = newstring.replace(needle, replacement)
        return newstring
if __name__ == "__main__":
    # Running this file as a script does nothing: the guarded body is a single
    # bare string literal that holds a disabled local smoke test.
    # NOTE(review): the disabled snippet calls getFilmEntries() with no
    # arguments and uses the Python 2 print statement; check both against the
    # current method signature and interpreter before re-enabling it.
    ''' local testing
url = "file:///home/age/divx.html"
print "reading %s" % url
films = [["filmtitle","language","codec","cds","comment"]]
foo = ReadJoerchs(url, films)
x = foo.getFilmEntries()
print x
'''