78 lines
2.2 KiB
Python
78 lines
2.2 KiB
Python
|
#!/usr/bin/env python
|
|||
|
import string
|
|||
|
import httplib, urllib
|
|||
|
|
|||
|
class ReadJoerchs:
    """Read film entries out of an HTML table page.

    The page is expected to contain ``<tr>...</tr>`` rows whose data cells
    are written as ``<td class="zelle">...</td>``.  (The original author
    described the input as a 'pcmcia htmlfile'.)

    ``films`` starts out as the caller-supplied ``tags`` list; its first
    entry is the header row, and its length defines how many cells a table
    row must have to be accepted.
    """

    # Markup delimiters the parser looks for.  Only these exact spellings are
    # recognised (e.g. a ``<tr>`` carrying attributes is not matched).
    _ROW_OPEN = '<tr>'
    _ROW_CLOSE = '</tr>'
    _CELL_OPEN = '<td class="zelle">'
    _CELL_CLOSE = '</td>'

    # (needle, replacement) pairs applied in order by replacestuff().
    # The escapes are the Latin-1 code points of the raw umlauts; the same
    # literal is a byte on Python 2 and a character on Python 3.
    _REPLACEMENTS = (
        ('&nbsp;', ' '),
        ('&quot;', '"'),
        ('&uuml;', 'ue'),
        ('&ouml;', 'oe'),
        ('&auml;', 'ae'),
        ('\xf6', 'oe'),
        ('\xe4', 'ae'),
        ('\xfc', 'ue'),
    )

    def __init__(self, url, tags):
        """Remember the page location and the initial film list.

        url  -- location of the HTML page (anything the URL opener accepts).
        tags -- 2D list whose first row names the expected columns, e.g.
                [["filmtitle", "language", "codec", "cds", "comment"]].
                The list is stored by reference and extended in place by
                getFilmEntries().
        """
        self.url = url
        self.films = tags

    def getFilmEntries(self, printfilms=False):
        """Return a 2D list with film data.

        Downloads the page, walks every table row and appends the cleaned
        cell texts of each row that has exactly as many cells as the header
        row ``self.films[0]``.  Rows with fewer or more cells are ignored.

        printfilms -- accepted for backward compatibility; it is not used.

        Returns ``self.films`` (header row first, then one list per row).
        Raises IndexError if ``self.films`` has no header row.
        """
        expected = len(self.films[0])
        for row in self._iter_rows(self._fetch()):
            cells = self._row_cells(row)
            if len(cells) == expected:
                self.films.append(cells)
        return self.films

    def _fetch(self):
        """Download ``self.url`` and return the page body as text."""
        try:
            from urllib.request import ProxyHandler, build_opener
        except ImportError:
            # Python 2: same opener the module always used (no proxies).
            import urllib
            conn = urllib.FancyURLopener({}).open(self.url)
        else:
            # Python 3: an empty proxy map keeps the "no proxies" behaviour.
            conn = build_opener(ProxyHandler({})).open(self.url)
        try:
            raw = conn.read()
            headers = getattr(conn, 'headers', None)
            get_charset = getattr(headers, 'get_content_charset', None)
            charset = get_charset() if get_charset else None
        finally:
            # Always release the connection, even if reading fails.
            conn.close()
        if isinstance(raw, str):
            # Python 2 byte string: parsed as-is.
            return raw
        try:
            return raw.decode(charset or 'utf-8')
        except (UnicodeDecodeError, LookupError):
            # Latin-1 maps every byte, so this fallback cannot fail.
            return raw.decode('latin-1')

    def _iter_rows(self, html):
        """Yield the text between each ``<tr>`` and its closing ``</tr>``."""
        pos = 0
        while True:
            start = html.find(self._ROW_OPEN, pos)
            if start == -1:
                # No further rows: this is the loop's termination condition.
                return
            start += len(self._ROW_OPEN)
            end = html.find(self._ROW_CLOSE, start)
            if end == -1:
                # Unterminated last row: take everything that is left.
                yield html[start:]
                return
            yield html[start:end]
            pos = end + len(self._ROW_CLOSE)

    def _row_cells(self, row):
        """Return the cleaned text of every ``<td class="zelle">`` in row."""
        cells = []
        pos = 0
        while True:
            start = row.find(self._CELL_OPEN, pos)
            if start == -1:
                # No further <td> in this row.
                break
            start += len(self._CELL_OPEN)
            end = row.find(self._CELL_CLOSE, start)
            if end == -1:
                # Missing </td>: the cell runs to the end of the row.
                end = len(row)
            text = self.replacestuff(row[start:end])
            # A cell that is empty after clean-up is stored as one blank.
            cells.append(text or " ")
            pos = end
        return cells

    def replacestuff(self, data):
        """Replace things like ``&quot;`` with ``"`` and return the result.

        Handles ``&nbsp;`` and ``&quot;`` and transliterates the lower-case
        German umlauts (as HTML entities or as Latin-1 characters) to
        ``ue`` / ``oe`` / ``ae``.
        """
        for needle, replacement in self._REPLACEMENTS:
            data = data.replace(needle, replacement)
        return data
|
|||
|
|
|||
|
|
|||
|
if __name__ == "__main__":
    # Manual smoke test, intentionally disabled: the whole snippet sits
    # inside a string literal, so running this module as a script does
    # nothing.  The snippet uses Python 2 print statements and a
    # machine-specific file path; it would need updating before being
    # re-enabled.
    ''' local testing
    url = "file:///home/age/divx.html"
    print "reading %s" % url
    films = [["filmtitle","language","codec","cds","comment"]]

    foo = ReadJoerchs(url, films)
    x = foo.getFilmEntries()
    print x
    '''
|
|||
|
|