#!/usr/bin/env python
"""Read film entries out of an HTML table page.

The page is expected to contain one ``<tr>...</tr>`` per film, with every
value wrapped in ``<td class="zelle">...</td>``.
"""

import string  # unused here; retained from the original import list

try:
    import httplib  # Python 2 name; unused here, retained from the original
except ImportError:
    import http.client as httplib  # Python 3 renamed the module

import urllib

# Literal markers the page is scanned for.  Tags carrying other attributes
# (e.g. ``<tr class="x">``) are deliberately not matched.
_ROW_OPEN = '<tr>'
_ROW_CLOSE = '</tr>'
_CELL_OPEN = '<td class="zelle">'
_CELL_CLOSE = '</td>'

# (needle, replacement) pairs applied in order by ReadJoerchs.replacestuff.
# NOTE(review): the entity names were reconstructed -- the stored source had
# them already decoded (which left an unterminated '"""' literal and three
# identical, garbled umlaut literals).  Confirm against the upstream file.
_REPLACEMENTS = (
    ('&nbsp;', ' '),
    ('&quot;', '"'),
    ('&uuml;', 'ue'),
    ('&ouml;', 'oe'),
    ('&auml;', 'ae'),
    # Raw umlauts: Latin-1 bytes on Python 2, the same code points on Python 3.
    ('\xf6', 'oe'),
    ('\xe4', 'ae'),
    ('\xfc', 'ue'),
)


def _open_url(url):
    """Open *url* with an empty proxy map, on Python 2 and Python 3."""
    try:
        from urllib.request import ProxyHandler, build_opener  # Python 3
    except ImportError:
        return urllib.FancyURLopener({}).open(url)  # Python 2
    return build_opener(ProxyHandler({})).open(url)


def _to_text(raw):
    """Return *raw* as ``str``; only Python 3 ``bytes`` need decoding."""
    if isinstance(raw, str):
        return raw
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # Latin-1 maps every byte, so this fallback cannot fail.
        return raw.decode('latin-1')


def _iter_between(text, open_tag, close_tag):
    """Yield every substring of *text* enclosed by *open_tag*/*close_tag*.

    Scanning stops at the first opening tag that has no closing tag, so a
    truncated document never produces a half-read element.
    """
    pos = 0
    while True:
        start = text.find(open_tag, pos)
        if start == -1:
            return
        start += len(open_tag)
        end = text.find(close_tag, start)
        if end == -1:
            return
        yield text[start:end]
        pos = end + len(close_tag)


class ReadJoerchs:
    """Reads films out of an HTML file.

    ``tags`` is a list whose first element is the list of expected column
    names; parsed rows are appended to that same list object.
    """

    def __init__(self, url, tags):
        self.url = url
        self.films = tags

    def getFilmEntries(self, printfilms=None):
        """Return a 2D list with film data.

        Downloads ``self.url`` and appends one list per table row to
        ``self.films`` (the list passed to the constructor, so the caller's
        list is modified and repeated calls append again).  A row is kept
        only if it has exactly as many ``zelle`` cells as ``self.films[0]``
        has column names.

        ``printfilms`` is accepted for backward compatibility and ignored.

        Raises IndexError if ``self.films`` is empty.
        """
        conn = _open_url(self.url)
        try:
            html = _to_text(conn.read())
        finally:
            conn.close()

        expected = len(self.films[0])
        for row in _iter_between(html, _ROW_OPEN, _ROW_CLOSE):
            # A cell that is empty after clean-up is stored as a single
            # space, so every kept row has non-empty values.
            cells = [
                self.replacestuff(cell) or " "
                for cell in _iter_between(row, _CELL_OPEN, _CELL_CLOSE)
            ]
            if len(cells) == expected:
                self.films.append(cells)
        return self.films

    def replacestuff(self, data):
        """Replace HTML entities and umlauts in *data* with ASCII text.

        For example ``&quot;`` becomes ``"`` and ``&uuml;`` becomes ``ue``.
        """
        # TODO: replace this table with a proper entity decoder.
        newstring = data
        for needle, replacement in _REPLACEMENTS:
            newstring = newstring.replace(needle, replacement)
        return newstring


if __name__ == "__main__":
    # The body is only a string literal, so running this module does nothing;
    # it preserves the original author's manual test recipe.
    ''' local testing
    url = "file:///home/age/divx.html"
    print "reading %s" % url
    films = [["filmtitle","language","codec","cds","comment"]]
    foo = ReadJoerchs(url, films)
    x = foo.getFilmEntries()
    print x
    '''