# codekasten/FilmBar/readJoerchs.py

#!/usr/bin/env python
import string
import httplib, urllib

class ReadJoerchs:
    """Reads film entries out of an HTML table page.

    Rows are delimited by ``<tr>`` / ``</tr>`` and cells by
    ``<td class="zelle">`` / ``</td>``.  The markers are matched literally,
    so tags carrying other attributes are not recognised.
    """

    _ROW_OPEN = '<tr>'
    _ROW_CLOSE = '</tr>'
    _CELL_OPEN = '<td class="zelle">'
    _CELL_CLOSE = '</td>'

    # (search text, replacement) pairs, applied in this order by
    # replacestuff().  The umlauts are written as escapes so that this
    # source stays pure ASCII and needs no encoding declaration.
    _REPLACEMENTS = (
        ('&nbsp;', ' '),
        ('&quot;', '"'),
        ('&uuml;', 'ue'),
        ('&ouml;', 'oe'),
        ('&auml;', 'ae'),
        ('\xf6', 'oe'),  # o-umlaut
        ('\xe4', 'ae'),  # a-umlaut
        ('\xfc', 'ue'),  # u-umlaut
    )

    def __init__(self, url, tags):
        """Remember the page address and the list that collects the rows.

        url  -- address handed to the URL opener in getFilmEntries().
        tags -- list whose first element describes the expected columns;
                only the length of that first element is used.  Parsed rows
                are appended to this very list (it is not copied).
        """
        self.url = url
        self.films = tags

    def getFilmEntries(self, printfilms=False):
        """Fetch the page and return the film list as a 2D list.

        printfilms -- unused; accepted only so existing callers keep working.

        Returns self.films, i.e. the list given to the constructor with one
        list of cell texts appended per accepted table row.  Calling this
        method again appends the rows again.
        """
        opener = urllib.FancyURLopener({})
        conn = opener.open(self.url)
        try:
            html = conn.read()
        finally:
            # Release the connection even if reading fails.
            conn.close()
        return self._parseRows(html)

    def _parseRows(self, html):
        """Append every well-formed table row found in html to self.films.

        A row is accepted only if it holds exactly as many cells as
        self.films[0] has entries; rows with fewer or more cells are
        skipped.  Returns self.films.
        """
        expected = len(self.films[0])
        for row in self._findElements(html, self._ROW_OPEN, self._ROW_CLOSE):
            cells = []
            for raw in self._findElements(row, self._CELL_OPEN,
                                          self._CELL_CLOSE):
                text = self.replacestuff(raw)
                # An empty cell is stored as a single space.
                if not text:
                    text = " "
                cells.append(text)
            if len(cells) == expected:
                self.films.append(cells)
        return self.films

    def _findElements(self, text, openTag, closeTag):
        """Return the texts enclosed by each openTag ... closeTag pair.

        Scanning is strictly left to right and stops when no further
        openTag is found.  If the last element has no closeTag, it is taken
        to run to the end of text.
        """
        found = []
        pos = 0
        while True:
            start = text.find(openTag, pos)
            if start == -1:
                break
            start += len(openTag)
            # Look for the closing tag right after the opening one.
            end = text.find(closeTag, start)
            if end == -1:
                found.append(text[start:])
                break
            found.append(text[start:end])
            # Advance a cursor instead of re-slicing the whole text.
            pos = end + len(closeTag)
        return found

    def replacestuff(self, data):
        """Return data with HTML entities and umlauts replaced.

        Replaces things like &quot; by " and transliterates the lower-case
        German umlauts (as entities or as single characters) to ue/oe/ae.
        """
        for old, new in self._REPLACEMENTS:
            data = data.replace(old, new)
        return data


if __name__ == "__main__":
    # Local testing snippet, currently disabled: the whole body is one bare
    # string literal, so nothing runs when this module is executed as a
    # script.  The text sketches reading a local HTML file through
    # ReadJoerchs and printing the result (Python 2 print statements).
    ''' local testing
    url = "file:///home/age/divx.html"
    print "reading %s" % url
    films = [["filmtitle","language","codec","cds","comment"]]

    foo = ReadJoerchs(url, films)
    x = foo.getFilmEntries()
    print x
    '''