#!/usr/bin/env python
|
|
import string
|
|
import httplib, urllib
|
|
|
|
class ReadJoerchs:
    """Read film entries out of an HTML table page.

    A film is one ``<tr>...</tr>`` row whose data cells are written as
    ``<td class="zelle">...</td>``.  The first element of ``tags`` is the
    list of column names; a row is accepted only when it has exactly that
    many cells.  Accepted rows are appended to the ``tags`` list itself.
    """

    # Literal markers searched for in the page (exact, case-sensitive).
    _ROW_OPEN = '<tr>'
    _ROW_CLOSE = '</tr>'
    _CELL_OPEN = '<td class="zelle">'
    _CELL_CLOSE = '</td>'

    # (needle, replacement) pairs, applied in this order by replacestuff().
    # NOTE(review): the entity spellings (&nbsp;, &quot;, &uuml;, &ouml;,
    # &auml;) are reconstructed -- the copy under review showed them already
    # decoded (a space-for-space no-op and an unterminated '"""').  Confirm
    # against the upstream file.
    # NOTE(review): the last three needles are non-ASCII literals; under
    # Python 2 the file needs a source encoding declaration that matches the
    # encoding of the pages being read.
    _REPLACEMENTS = (
        ('&nbsp;', ' '),
        ('&quot;', '"'),
        ('&uuml;', 'ue'),
        ('&ouml;', 'oe'),
        ('&auml;', 'ae'),
        ('ö', 'oe'),
        ('ä', 'ae'),
        ('ü', 'ue'),
    )

    def __init__(self, url, tags):
        """Store the page location and the result list.

        url  -- location of the HTML page; opened by getFilmEntries().
        tags -- list whose first element is the list of column names.  The
                list is stored as-is (not copied), so parsed rows end up in
                the caller's list as well.
        """
        self.url = url
        self.films = tags

    def getFilmEntries(self, printfilms=False):
        """Fetch self.url and return the film table as a 2D list.

        printfilms -- accepted for backward compatibility; it is not used.

        Returns self.films: the header row passed to the constructor,
        followed by one list of cleaned cell texts per accepted table row.
        """
        # An empty proxies mapping is passed on purpose, as before.
        opener = urllib.FancyURLopener({})
        conn = opener.open(self.url)
        try:
            wholefile = conn.read()
        finally:
            # Always release the connection, even if read() fails.
            conn.close()
        return self.parseFilmEntries(wholefile)

    def parseFilmEntries(self, html):
        """Append every matching table row found in ``html`` to self.films.

        html -- the complete page text.

        Scans all ``<tr>`` rows until none is left (no fixed row limit).  A
        row is appended only when its number of 'zelle' cells equals the
        number of column names in self.films[0].  A final row without a
        closing ``</tr>`` extends to the end of the text.

        Returns self.films.
        """
        expected = len(self.films[0])
        pos = 0
        while True:
            row_start = html.find(self._ROW_OPEN, pos)
            if row_start == -1:
                # No further row: this is the loop's termination condition.
                break
            row_start += len(self._ROW_OPEN)
            # Search for the row end right after the row start so that a
            # long preamble before the first row cannot skip its '</tr>'.
            row_end = html.find(self._ROW_CLOSE, row_start)
            if row_end == -1:
                row_end = len(html)
            cells = self._parseCells(html[row_start:row_end])
            if len(cells) == expected:
                self.films.append(cells)
            pos = row_end
        return self.films

    def _parseCells(self, row):
        """Return the cleaned text of every 'zelle' cell inside one row."""
        cells = []
        pos = 0
        while True:
            start = row.find(self._CELL_OPEN, pos)
            if start == -1:
                # No more data cells in this row.
                break
            start += len(self._CELL_OPEN)
            end = row.find(self._CELL_CLOSE, start)
            if end == -1:
                # Unterminated cell: take everything up to the row end.
                end = len(row)
            text = self.replacestuff(row[start:end])
            # A cell that is empty after cleaning is stored as one blank.
            cells.append(text or " ")
            pos = end
        return cells

    def replacestuff(self, data):
        """Return ``data`` with HTML entities and umlauts replaced.

        Replaces things like ``&quot;`` by ``"`` and transliterates German
        umlauts (as entity or literal character) to ae/oe/ue, following the
        order of _REPLACEMENTS.
        """
        # TODO: a generic entity decoder would be nicer than a fixed table.
        for needle, replacement in self._REPLACEMENTS:
            data = data.replace(needle, replacement)
        return data
|
|
|
|
|
|
if __name__ == "__main__":
    # The local test below is disabled: it is wrapped in a string literal,
    # so running this file as a script executes nothing.
    ''' local testing
    url = "file:///home/age/divx.html"
    print "reading %s" % url
    films = [["filmtitle","language","codec","cds","comment"]]

    foo = ReadJoerchs(url, films)
    x = foo.getFilmEntries()
    print x
    '''
|
|
|