codekasten/readJoerchs.py

78 lines
2.2 KiB
Python
Raw Normal View History

#!/usr/bin/env python
import string
import httplib, urllib
class ReadJoerchs:
    """Read film entries out of an HTML table page.

    The document at ``url`` is downloaded and scanned for ``<tr>`` rows.
    Every row that contains exactly as many ``<td class="zelle">`` cells as
    there are column names in ``tags[0]`` becomes one film entry.
    """

    # Literal markers the scanner looks for. Only these exact spellings
    # match; a ``<tr>`` carrying attributes, for example, is not recognised.
    _ROW_OPEN = '<tr>'
    _ROW_CLOSE = '</tr>'
    _CELL_OPEN = '<td class="zelle">'
    _CELL_CLOSE = '</td>'

    # (search, replacement) pairs applied in this order by replacestuff().
    # The last three entries were unreadable (all the same mangled literal)
    # in the version this was restored from; judging by their replacements
    # they were the raw o-, a- and u-umlaut characters.
    # NOTE(review): they are written as Latin-1 escapes here, which assumes
    # the scraped page is Latin-1 encoded -- confirm against the real page.
    _REPLACEMENTS = (
        ('&nbsp;', ' '),
        ('&quot;', '"'),
        ('&uuml;', 'ue'),
        ('&ouml;', 'oe'),
        ('&auml;', 'ae'),
        ('\xf6', 'oe'),
        ('\xe4', 'ae'),
        ('\xfc', 'ue'),
    )

    def __init__(self, url, tags):
        """Remember the page address and the result list.

        url  -- address handed to the URL opener in getFilmEntries().
        tags -- 2D list whose first element is the list of column names.
                The very same list object is kept (not copied), so parsed
                rows are appended to the caller's list as well.
        """
        self.url = url
        self.films = tags

    def getFilmEntries(self, printfilms=False):
        """Download the page and return a 2D list with the film data.

        printfilms -- accepted for backward compatibility; it is not used.

        Returns ``self.films``: the column names given to the constructor
        followed by one list of cell texts per matching table row. Rows are
        appended on every call, so calling this twice adds them twice.
        Raises IndexError if the list given to the constructor is empty.
        """
        opener = urllib.FancyURLopener({})
        conn = opener.open(self.url)
        try:
            wholefile = conn.read()
        finally:
            # Release the connection even if reading fails.
            conn.close()
        self.films.extend(self._parseRows(wholefile))
        return self.films

    def _parseRows(self, html):
        """Return the cell lists of all rows in *html* with the expected width."""
        width = len(self.films[0])
        rows = []
        pos = 0
        while True:
            opened = html.find(self._ROW_OPEN, pos)
            if opened == -1:
                # No further row: this is the loop's termination condition.
                break
            start = opened + len(self._ROW_OPEN)
            # Look for the end of the row right after its start tag.
            closed = html.find(self._ROW_CLOSE, start)
            if closed == -1:
                # Unterminated last row: nothing reliable left to parse.
                break
            cells = self._parseCells(html[start:closed])
            # Rows with too few or too many cells are not film entries.
            if len(cells) == width:
                rows.append(cells)
            pos = closed + len(self._ROW_CLOSE)
        return rows

    def _parseCells(self, row):
        """Return the cleaned texts of all ``<td class="zelle">`` cells in *row*."""
        cells = []
        pos = 0
        while True:
            opened = row.find(self._CELL_OPEN, pos)
            if opened == -1:
                # No (further) cell in this row.
                break
            start = opened + len(self._CELL_OPEN)
            closed = row.find(self._CELL_CLOSE, start)
            if closed == -1:
                # Unterminated cell: ignore it and stop scanning the row.
                break
            text = self.replacestuff(row[start:closed])
            # Keep a single blank as placeholder for a cell that is empty.
            cells.append(text or " ")
            pos = closed + len(self._CELL_CLOSE)
        return cells

    def replacestuff(self, data):
        """Return *data* with entities such as ``&quot;`` replaced by plain text.

        Handles ``&nbsp;`` and ``&quot;`` and transliterates the lower-case
        German umlauts (as entities or as raw characters) to oe/ae/ue.
        """
        # TODO: replace this fixed table with a general entity decoder.
        for old, new in self._REPLACEMENTS:
            data = data.replace(old, new)
        return data
if __name__ == "__main__":
    # The manual smoke test below is deliberately disabled: it is wrapped in
    # a string literal, so running this file as a script does nothing.
    # To use it, remove the quotes and point `url` at a local copy of the page.
    ''' local testing
    url = "file:///home/age/divx.html"
    print "reading %s" % url
    films = [["filmtitle","language","codec","cds","comment"]]
    foo = ReadJoerchs(url, films)
    x = foo.getFilmEntries(False)
    print x
    '''