#! /usr/bin/python # TPG's IPTV xmltv generator # By Jean-Yves Avenard: jean-yves@avenard.org # # Copyright (c) 2008, JEAN-YVES AVENARD # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # * Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # * Neither the name of the nor the # names of its contributors may be used to endorse or promote products # derived from this software without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY JEAN-YVES AVENARD ``AS IS'' AND ANY # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE # DISCLAIMED. IN NO EVENT SHALL JEAN-YVES AVENARD BE LIABLE FOR ANY # DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND # ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#
# Derived from scraper.py by Michael Foord
# Released subject to the BSD License
# Please see http://www.voidspace.org.uk/documents/BSD-LICENSE.txt
# Scripts maintained at http://www.voidspace.org.uk/python/index.shtml
#
# Big thanks to Stephen Dredge from TPG for his ongoing support and help
#

# Grabber identity strings.
NAME = "tv_grab_au_tpg"
VERSION = '0.4.5'
DESCRIPTION = "Australia (TPG's IPTV from various sources)"
PREFERREDMETHOD = 'allatonce'
# NOTE(review): the separators between the three capability names were lost
# when the file was flattened; one capability per line is assumed here.
CAPABILITIES = """baseline
manualconfig
preferredmethod"""

# The module docstring doubles as the usage message printed by usage().
__doc__ = """Export TPG's IPTV channel information

This grabber is distributed under the BSD License.

Usage: tv_grab_au_tpg [options]

Options:

    -h / --help
        Print this message and exit.

    --version
        Print the version

    --quiet
        Suppress all progress information (but there are none anyway).

    --verbose
        Display additional information (for debugging only)

    --configure
        Does nothing, purely provided to work with mythtv

    -d / --days
        Grab N days. Defaults is grabbing as many days of data as is
        available on the source.

    --offset
        Start grabbing N days in the future. Defaults to 0; starting
        grabbing with today's data.

    -o / --output
        By default, the output is directed to stdout. Write standard
        output to filename instead.

    --capabilities
        List XMLTV capabilities.

    --pretty
        Output XML in pretty/readable mode.

    --preferredmethod
        Tell the calling program that we prefer to return all the data
        at once.

    --description
        Tell everyone that we're an Australian grabber.
"""

import sys
import os
import re
import string
import StringIO
from xml.dom.minidom import Document
import xml.dom.minidom
import getopt
import datetime
import pytz
import copy
import htmlentitydefs

#For opening the URL
import socket
import urllib2
import urllib
import cookielib

#Define which channels are active
#grab_channel = {
#    'tv5' : False, 'aljazeera': False, 'tve': False , 'dw': False, 'bvn': False,
#    'duna': False, 'trtint': True, 'cuba' : False, 'sctv': False, 'newsasia' : False,
#    'rt' : False, 'ertworld' : False, 'rtm': False, 'voa': False, 'eurosport': False, 'eurosportnews': False,
#    'bloomberg': False, 'tpg': False, 'thaitv5': False, 'vtv4' : False, 'tvri' : False, 'ddsport' : False,
#    'worldfashion' : False, 'cctv': False, 'tpgnasa': False
#    }

# Channel id -> whether that channel is grabbed.
grab_channel = {
    'tv5': True,
    'aljazeera': True,
    'tve': True,
    'dw': True,
    'bvn': True,
    'duna': True,
    'trtint': True,
    'cuba': True,
    'sctv': True,
    'newsasia': True,
    'rt': True,
    'ertworld': True,
    'rtm': True,
    'voa': True,
    'eurosport': True,
    'eurosportnews': True,
    'bloomberg': True,
    'tpg': True,
    'thaitv5': True,
    'vtv4': True,
    'tvri': True,
    'worldfashion': True,
    'cctv': True,
    'tpgnasa': True,
}

# State code -> three integers.
# NOTE(review): these look like [UTC offset as HHMM, observes DST flag,
# source-specific zone id] - confirm against the code that reads time_zone.
time_zone = {
    'SA': [930, 1, 238],
    'WA': [800, 1, 241],
    'NT': [930, 0, 336],
    'ACT': [1000, 1, 15],
    'VIC': [1000, 1, 240],
    'NSW': [1000, 1, 235],
    'QLD': [1000, 0, 237],
    'TAS': [1000, 1, 239],
}

#TV5 global settings
tv5url = 'http://www.tv5.org/TV5Site/programmes/universal/pop_print.php?descr=1'
cookietv5 = {
    'cid_signal': '4',
    'cid_zone': '240',
    'c_id_signal': '4',
    'c_id_version': '1',
    'c_id_zone': '240',
    'c_id_trad': '3',
}
tv5channelname = 'TV5'
tv5idname = 'tv5'
maxtv5cookie = 1269604634  #Valid until somewhere in 2010
tv5mainurl = 'www.tv5.org'
# Each state table row is a sequence of tag names and '_field_' placeholders,
# terminated by -1 (the terminator Scraper.checkifcompleted looks for).
statetv5 = [
    ['ul', 'li', '_date_', 'ul', 'li', '_read_', -1],
    ['li', 'strong', '_time_', 'a', '_title_', 'em', '_category_', -1],
    ['li', 'strong', '_time_', 'a', '_title_', 'em', '_category_', 'span', 'p', '_desc_', -1],
    ['li', 'strong', '_time_', 'a', '_title_', 'em', '_category_', 'span', 'p', 'span', '_subtitle_', '_desc_', -1],
    ['li', 'strong', '_time_', 'a', '_title_', 'em', '_category_', 'span', 'p', 'span', '_subtitle_', -1],
    ['li', 'strong', '_time_', 'a', '_title_', 'em', '_category_', 'img', 'span', 'p', '_desc_', -1],
    ['li', 'strong', '_time_', 'a', '_title_', 'em', '_category_', 'img', 'span', 'p', 'span', '_subtitle_', '_desc_', -1],
    ['li', 'strong', '_time_', 'a', '_title_', 'em', '_category_', 'img', 'img', 'span', 'p', '_desc_', -1],
    ['li', 'strong', '_time_', 'a', '_title_', 'em', '_category_', 'img', 'img', 'span', 'p', 'span', '_subtitle_', '_desc_', -1],
]

#Al Jazeera settings
ajurl = 'http://xmltv.radiotimes.com/xmltv/2055.dat'
ajchannelname = 'Al Jazeera'
ajidname = 'aljazeera'

#TVE International settings
tveurl = 'http://www.rtve.es/tve/programo/avan3/tv3s'
tvechannelname = 'TVE International'
tveidname = 'tve'
statetve = [
    ['strong', 'u', '_date_', '_title_', -1],
    ['strong', 'u', '_date_', '_title_', '_desc_', -1],
    ['strong', '_title_', -1],
    ['strong', '_title_', '_desc_', -1],
    ['strong', '_title_', 'strong', '_subtitle_', -1],
    ['strong', '_title_', 'strong', '_subtitle_', '_desc_', -1],
]

#lyngsat format: 0: id, 1: url, 2: name, 3: language, 4: timezone , 5: dst
lyngsat = [
#    [ 'dw', 'http://www.lyngsat-guide.com/DW-TV.html', 'DW-TV', 'de', 200, 0 ],
#    [ 'bvn', 'http://www.lyngsat-guide.com/BVN-TV.html', 'Dutch TV', 'nl', 200, 0 ],
#    [ 'cctv4', 'http://www.lyngsat-guide.com/CCTV-4.html', 'Chinese TV 1' ,'cn', 200, 0 ],
#    [ 'duna', 'http://www.lyngsat-guide.com/Duna-TV.html', 'Hungarian TV' ,'hu', 200, 0 ],
#    [ 'trtint', 'http://www.lyngsat-guide.com/TRT-International.html', 'TRT International' ,'tr', 200, 0 ],
]
statelyngsat = [
    ['tr', 'td', 'td', 'font808080', 'b', '_time_', 'td', 'font808080', 'b', '_title_', 'font808080', '_desc_', -1],
    ['tr', 'td', 'td', 'fontblack', 'b', '_time_', 'td', 'fontblack', 'b', '_title_', 'fontblack', '_desc_', -1],
    ['tr', 'td', 'td', 'fontblack', 'b', '_time_', 'td', 'fontblack', 'b', '_title_', 'img', 'fontblack', '_desc_', -1],
]

#Cubavision settings
# Per-channel scraping settings.  Every channel has a source URL, a display
# name, an id, and a state table.  Each state table row is a sequence of tag
# names and '_field_' placeholders terminated by -1.
cubaurl = 'http://www.cubavision.cubaweb.cu/satelite.asp'
cubachannelname = 'Cuba Vision'
cubaidname = 'cuba'
statecuba = [
    ['table', 'tr', 'td', '_read_', 'td', '_time_', 'td', '_read_', 'td', '_title_', 'td', -1],
]

#SCTV settings
sctvurl = 'http://www.communitytv.com.au/p3.htm'
sctvchannelname = 'SCTV'
sctvidname = 'sctv'
statesctv = [
    ['strong', '_date_', -1],
    ['br', '_title_', -1],
    ['span', '_title_', -1],
]

#Channel NewsAsia
newsasiaurl = 'http://www.mediacorpsingapore.com/tvguide/cnasing_list_daily.asp'
newsasiachannelname = 'Channel NewsAsia'
newsasiaidname = 'newsasia'
statenewsasia = [
    ['p', 'font', '_date_', 'table', -1],
    ['tr', 'td', 'font', '_time_', 'td', 'font', '_title_', 'font', '_subtitle_', 'font', 'i', '_desc_', -1],
    ['tr', 'td', 'font', '_time_', 'td', 'font', '_title_', 'font', '_subtitle_', -1],
]

#Russia Today
russiaurl = 'http://www.russiatoday.ru/schedule/date'
russiachannelname = 'Russia Today'
russiaidname = 'rt'
staterussia = [
    ['_time_', 'br', -1],
    ['_time_', 'a', '_desc_', 'br', -1],
]

#Greek TV
ertworldurl = 'http://tvradio.ert.gr/en/worldprogram.asp'
ertworldchannelname = 'Greek TV'
ertworldidname = 'ertworld'
stateertworld = [
    ['tr', 'td', 'td', 'div', '_time_', 'br', -1],
    ['br', '_time_', 'br', -1],
]

#Malaysian TV
rtmurl = 'http://www.rtm.net.my/tvschedule/bi/schedule1.php'
rtmchannelname = 'Malaysian TV'
rtmidname = 'rtm'
statertm = [
    ['tr', 'td', 'span', '_date_', -1],
    ['tr', 'td', 'font333333', 'span', '_time_', 'td', 'span', '_title_', -1],
]

#Voice of America
voaurl = 'http://ibb7-2.ibb.gov/tvschedule/getlisting.cfm'
voachannelname = 'Voice of America'
voaidname = 'voa'
statevoa = [
    ['tr', 'td', 'b', '_data_', '_data2_', -1],
    ['tr', 'td', 'b', '_data_', '_data2_', 'b', '_data_', '_data2_', -1],
    ['tr', 'td', 'b', '_title_', 'a', '_title2_', -1],
    ['tr', 'td', 'b', '_title_', 'a', '_title2_', '_title3_', -1],
]

#Classic, Playboy and Adult One
tpgurl = 'http://tpg.com.au/iptv/guide_playboy_adult_classic.xmltv'
tpgnasaurl = 'http://tpg.com.au/iptv/guide_nasatv.xmltv'
#Eurosport
eurosporturl = 'http://yahoo.eurosport.com/tvschedule_clng0'
eurosportchannelname = 'Eurosport'
eurosportidname = 'eurosport'
# Each state table row is a sequence of tag names and '_field_' placeholders
# terminated by -1.
stateeurosport = [
    ['table', 'tr', 'td', 'div', '_ignore_', 'div', 'span', '_ignore_', -1],
    ['td', 'div', '_time_', 'div', 'span', '_title_', 'div', 'p', 'a', '_desc_', -1],
]

#Eurosport News
eurosportnewschannelname = 'Eurosport News'
eurosportnewsidname = 'eurosportnews'

#Bloomberg
bloombergurl = 'http://www.bloomberg.com/tvradio/tv/schedule_asia'
bloombergchannelname = 'Bloomberg Television'
bloombergidname = 'bloomberg'
statebloomberg = [
    ['table', 'tr', 'td', 'span', '_ignoredate_', 'td', 'span', '_ignoretime_', 'td', 'span', '_ignoretitle_', 'td', 'span', '_ignoredesc_', -1],
    ['tr', 'td', 'span', '_ignoredate_', 'td', 'span', '_time_', 'td', 'span', '_title_', 'td', 'span', '_desc_', -1],
]

#Thai TV5
thaitv5url = 'http://www.tv5.co.th/programs/day.php'
thaitv5channelname = 'Thai TV5'
thaitv5idname = 'thaitv5'
statethaitv5 = [
    ['aentry', '_time_', -1],
]

#VTV 4:
vtv4url = 'http://www.vtv.vn/VN/TrangChu/LichPhatSong/VTV4'
vtv4channelname = 'VTV4'
vtv4idname = 'vtv4'
statevtv4 = [
    ['divschedule', 'divtime', '_time_', 'divprogram', '_title_', 'divcontent', 'script', '_ignore_', '_desc_', -1],
]

#TVRI
tvriurl = 'http://www.tvri.co.id'
tvrichannelname = 'Indonesian TV'
tvriidname = 'tvri'
statetvri = [
    ['tr', 'td', 'div', '_time_', 'td', 'td', 'a', '_title_', -1],
]

#World Fashion
wofurl = 'http://www.wfc.tv/pl/'
wofchannelname = 'World Of Fashion'
wofidname = 'worldfashion'
statewof = [
    ['tr', 'td', 'span', '_time_', '_ignore_', 'td', '_ignore_', 'td', 'a', '_title_', -1],
]

#CCTV
cctvchannels = [
    {'id': 'cctv4', 'name': 'Chinese TV1', 'number': 5},
    {'id': 'cctv9', 'name': 'Chinese TV2', 'number': 12},
    {'id': 'cctv1', 'name': 'Chinese TV3', 'number': 2},
    {'id': 'cctv-music', 'name': 'Chinese TV4', 'number': 18},
    {'id': 'cctv11', 'name': 'Chinese TV5', 'number': 14},
    {'id': 'cctv2', 'name': 'Chinese TV6', 'number': 3},
]
cctvurl = 'http://tv.cctv.com/soushi/05'
googletranslatechinese = 'http://209.85.171.104/translate_c?hl=en&langpair=zh|en&u='
statecctv = [
    ['spantime', '_time_', 'div', 'span', 'spangoogle-src-text', 'a', 'spantitle', '_desc_', 'a', 'spantitle', '_title_', -1],
]

#DW
dwurl = 'http://www9.dw-world.de/regionalisierung/programm.php'
dwchannelname = 'DW-TV'
dwidname = 'dw'
statedw = [
    ['tdtimecellday[0-9]', '_time_', 'tdbcasttitle', 'span', 'a', '_title_', 'spanbcastdesc', '_desc_', -1],
]

#TRT
trturl = 'http://www.trt.net.tr/TV/TvAkis.aspx'
trtchannelname = 'TRT International'
trtidname = 'trtint'
statetrt = [
    ['tr', 'tdprogramsaat', '_time_', 'tdprogramad', '_title_', -1],
    ['tr', 'tdprogramsaat', '_time_', 'tdprogramad', 'a', '_title_', -1],
]

#BVN
bvnurl = 'http://www.bvn.nl/tvgids/tvgids'
bvnchannelname = 'Dutch TV'
bvnidname = 'bvn'

#Duna
dunaurl = 'http://www.dunatv.hu/felsomenu/musorujsag'
dunachannelname = 'Hungarian TV'
dunaidname = 'duna'
stateduna = [
    ['divmusortime', '_time_', -1],
    ['divmusortitle', '_title_', -1],
    ['divmusorsubtitle', '_subtitle_', -1],
    ['divmusorgenre', '_desc_', -1],
    ['divmusorsynops', '_desc_', -1],
    ['divmusorkozrem', '_desc_', -1],
    ['divmusorepisodnum', '_desc_', -1],
]

#default setting values
maxdays = 7
day_offset = 0
default_config_dir = "~/.xmltv"
config_file = ""
verbose = 0
# Option name -> [tuple of accepted answers, default answer]; read by
# Config.initconfig.
item_config = {
    'TimeZone': [('SA', 'WA', 'NT', 'ACT', 'VIC', 'NSW', 'QLD', 'TAS'), 'VIC'],
}
monthwinter = 3
monthsummer = 10
localtimezone = 1000
dst = 0
idprefix = 'tpg.'
converttz = False

#namefind is supposed to match a tag name and attributes into groups 1 and 2 respectively.
#the original version of this pattern:
# namefind = re.compile(r'(\S*)\s*(.+)', re.DOTALL)
#insists that there must be attributes and if necessary will steal the last character
#of the tag name to make it so.
# this is annoying, so let us try:
namefind = re.compile(r'(\S+)\s*(.*)', re.DOTALL)
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')  # this is taken from sgmllib


def usage():
    """Print the module docstring, which holds the command line help."""
    print(__doc__)


class Error(Exception):
    """Grabber error carrying a human readable message and a numeric code.

    Codes:
      1xx : configuration error
      2xx : url fetching error
      3xx : content issue
      4xx : post-processed issue
    """

    def __init__(self, message, code):
        Exception.__init__(self, message)
        self.message = message
        self.code = code

    def __str__(self):
        return repr(self.message)


class Config(object):
    """Reads, prompts for and writes the 'key:value' configuration file."""

    def __init__(self, config_file, init):
        """Locate the configuration file and, unless `init` is set, load it.

        config_file -- explicit path; when empty, '<NAME>.conf' inside
                       default_config_dir is used.
        init        -- true when the file is about to be (re)created: the
                       default directory is created if missing and the
                       file is not parsed.

        Raises Error(100) if the default directory cannot be created and
        Error(101) if a line of an existing file is not 'key:value'.
        """
        self.config = {}
        if not config_file:
            config_dir = os.path.expanduser(default_config_dir)
            if init and not os.path.isdir(config_dir):
                # Create config directory if it doesn't exist
                try:
                    os.mkdir(config_dir)
                except OSError:
                    raise Error('Failed to create config directory: %s' % config_dir, 100)
            self.config_file = os.path.join(config_dir, '%s.conf' % NAME)
        else:
            self.config_file = os.path.expanduser(config_file)

        # Always define `exists` so callers can read it whatever `init` is.
        self.exists = os.path.isfile(self.config_file)
        if init:
            return

        try:
            f = open(self.config_file)
        except IOError:
            self.exists = False
            return
        self.exists = True
        line_pattern = re.compile(r"([a-zA-Z0-9\s]+):([a-zA-Z0-9]+)$")
        try:
            for line in f:
                m = line_pattern.match(line)
                if not m:
                    raise Error("Invalid config file, run with --configure again", 101)
                self.config[m.group(1)] = m.group(2)
        finally:
            f.close()

    def initconfig(self):
        """Ask the user for every option in item_config and store the answers.

        An empty answer selects the option's default; any other answer is
        asked again until it is one of the accepted values.
        """
        answer = {}
        for x in item_config:
            choices, default = item_config[x][0], item_config[x][1]
            option = ''.join(i + ',' for i in choices)
            while True:
                answer[x] = raw_input('%s - %s: (%s)\t:' % (x, option, default))
                if answer[x] == '':
                    #Set default answer
                    answer[x] = default
                    break
                if answer[x] in choices:
                    break
                sys.stderr.write('Incorrect Answer\n')
        self.config = answer

    def write(self):
        """Write the configuration as 'key:value' lines.

        Raises Error(102) if the file cannot be opened for writing.
        """
        try:
            f = open(self.config_file, 'w')
        except IOError:
            raise Error("Couldn't write config file", 102)
        try:
            for x in self.config:
                f.write('%s:%s\n' % (x, self.config[x]))
        finally:
            f.close()


class Scraper:
    """Incremental HTML tokenizer that matches tag/text sequences against a table.

    Each row of the match table is a list of tag names (used as regular
    expressions) and '_field_' placeholders, terminated by -1.
    """

    def __init__(self, arg1, completefont=True, completeclass=False):
        """Initialise a parser.

        arg1          -- the match table.
        completefont  -- append the colour of <font> tags to the tag name.
        completeclass -- append the class attribute to the tag name.
        """
        self.buffer = ''
        self.outfile = ''
        self.processed = []
        self.matchtable = arg1
        #Initialise state table.
        self.workingtable = []
        self.statetable = []
        self.htmlpos = 0
        self.completedblock = []
        self.completefont = completefont
        self.completeclass = completeclass

    def reset(self):
        """This method clears the input buffer and the output buffer."""
        self.buffer = ''
        self.outfile = ''
        self.processed = []
        self.matchtable = []
        self.workingtable = []
        self.statetable = []

    def push(self):
        """This returns all currently processed data and empties the output buffer."""
        data = self.outfile
        self.outfile = ''
        return data

    def close(self):
        """Returns any unprocessed data (without processing it) and resets the parser.

        Should be used after all the data has been handled using feed and then
        collected with push. This returns any trailing data that can't be
        processed. If you are processing everything in one go you can safely
        use this method to return everything.
        """
        data = self.push() + self.buffer
        self.buffer = ''
        self.processed = []
        self.matchtable = []
        self.workingtable = []
        self.statetable = []
        return data

    def feed(self, data, ignoretag=None, emptyfield=True, replacecr=False, replacecrval=' '):
        """Pass more data into the parser.

        As much as possible is processed - but nothing is returned from this
        method.

        ignoretag    -- regular expression; matching tags are replaced by a
                        newline in the text instead of being handled.
        emptyfield   -- stored for pdata (whether blank text may fill a field).
        replacecr    -- stored for pdata (replace CR/LF in text).
        replacecrval -- replacement string used when replacecr is set.
        """
        self.index = -1
        # None means "no incomplete tag seen".  The original used 0, which
        # is also a valid buffer position, so a chunk that *started* with an
        # unterminated tag was silently dropped.
        self.tempindex = None
        self.buffer = self.buffer + data
        self.emptyfield = emptyfield
        self.replacecr = replacecr
        self.replacecrval = replacecrval
        outlist = []
        thischunk = []
        ignore = ignoretag and re.compile(ignoretag)
        # rewrite with a list of all the occurences of '<' and jump between
        # them, much faster than character by character - which is fast
        # enough to be fair...
        while self.index < len(self.buffer) - 1:
            self.index += 1
            inchar = self.buffer[self.index]
            if inchar != '<':
                thischunk.append(inchar)
                continue
            ok, result, attrs, thetag = self.tagstart(ignoretag)
            if ok and ignore and ignore.match(result.lower()):
                #If tag is to be ignored, replace it with a '\n'
                thischunk.append('\n')
            else:
                outlist.append(self.pdata(''.join(thischunk)))
                thischunk = []
                if ok:
                    result = self.handletag(result, attrs, thetag)
                if result:
                    outlist.append(result)
            if self.tempindex is not None:
                break
        if self.tempindex is not None:
            # keep the unterminated tag for the next call
            self.buffer = self.buffer[self.tempindex:]
        else:
            self.buffer = ''
        if thischunk:
            # trailing text not yet followed by a tag is kept as well
            self.buffer = ''.join(thischunk)
        self.outfile = self.outfile + ''.join(outlist)

    def tagstart(self, ignoretag):
        """We have reached the start of a tag.

        self.buffer is the data, self.index is the point we have reached.
        Extracts the tag name and all attributes.

        Returns a 4-tuple (ok, result, attrs, thetag):
          * (True, lowercased name, [(attrname, value), ...], raw tag text)
            for an opening tag, or for a closing tag matching `ignoretag`;
          * (False, text, 0, 0) for declarations, processing instructions,
            closing tags and nameless tags, where text is whatever
            pdecl/ppi/endtag/emptytag returned;
          * (False, False, 0, 0) when the tag is not terminated yet; in that
            case self.tempindex is set to the position of the '<'.
        """
        test1 = self.buffer.find('>', self.index + 1)
        test2 = self.buffer.find('<', self.index + 1)  # will only happen for broken tags with a missing '>'
        # shift by one so that "not found" (-1) becomes a falsy 0
        test1 += 1
        test2 += 1
        if not test2 and not test1:
            # if we get this far the buffer is incomplete (the tag doesn't close yet)
            self.tempindex = self.index
            self.index = len(self.buffer)  # this signals to feed that some of the buffer needs saving
            return False, False, 0, 0
        if test1 and test2:
            test = min(test1, test2)
            # if the closing '>' is missing and we're working from the next
            # starting tag - we need to be careful with our index position...
            mod = (test == test2)
        else:
            test = test1 or test2
            mod = bool(test2)
        thetag = self.buffer[self.index + 1:test - 1].strip()
        # as soon as we return, the index will have 1 added to it straight away
        if mod:
            self.index = test - 2
        else:
            self.index = test - 1

        if thetag.startswith('!'):  # is a declaration or comment
            return False, self.pdecl(thetag), 0, 0
        if thetag.startswith('?'):  # is a processing instruction
            return False, self.ppi(thetag), 0, 0
        if thetag.startswith('/'):
            if not (ignoretag and re.compile(ignoretag).match(thetag.lower()[1:])):
                return False, self.endtag(thetag), 0, 0  # is an endtag
            #Remove leading / as this tag is going to be ignored anyway
            thetag = thetag[1:]
        nt = namefind.match(thetag)
        if not nt:
            return False, self.emptytag(thetag), 0, 0  # nothing inside the tag ?
        name, attributes = nt.group(1, 2)
        #the doc says a tag must be nameless to be "empty", so a tag without
        #attributes is NOT treated as empty here.
        attrs = []
        # this little chunk nicked from sgmllib - except findall is used to
        # match all the attributes
        for attrname, rest, attrvalue in attrfind.findall(attributes):
            if not rest:
                attrvalue = attrname
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            attrs.append((attrname.lower(), attrvalue))
        return True, name.lower(), attrs, thetag  # deal with what we've found.
def checkifcompleted(self,state,i,completedblock): matchtable = self.matchtable #Check if we have reached the end of the table, if so, add the completed block to the completed list if matchtable[state[i]['linematch']][state[i]['position']] == -1: #Remove all previously completed block starting at or after the newly completed block for j in reversed(range(len(completedblock))): if completedblock[j]['begin'] >= state[i]['begin']: if verbose > 2: print 'deleted completed block %d' % j del completedblock[j] #Add as completed block completedblock.append({}) completedblock[-1]['end'] = self.htmlpos completedblock[-1]['begin'] = state[i]['begin'] completedblock[-1]['content'] = state[i]['content'] if verbose > 2: print 'Add completed block %d and delete state %d (match[%d])' % (len(completedblock),i,state[i]['linematch']) #We have completed this match, delete it del state[i] def scancompleted(self,state,completedblock): #Scan the completed blocks, if any current state start strictly before this completed block finished, delete the state #if all current states start stricly after when the block started: accept block for i in reversed(range(len(completedblock))): accepted = True for j in reversed(range(len(state))): if state[j]['begin'] > completedblock[i]['begin'] and state[j]['begin'] < completedblock[i]['end']: if verbose > 2: print "delete state %d (match[%d]) content=%s" % (j , state[j]['linematch'], state[j]['content']) del state[j] elif state[j]['begin'] <= completedblock[i]['begin']: accepted = False if accepted == True: self.processed += completedblock[i]['content'] if verbose > 2: print 'Validate completedblock %d and add %s' % (i , completedblock[i]['content']) del completedblock[i] ################################################################################################ # The following methods are called to handle the various HTML elements. # They are intended to be overridden in subclasses. def pdata(self, inchunk): """Called when we encounter a new tag. 
All the unprocessed data since the last tag is passed to this method. Dummy method to override. Just returns the data unchanged.""" state = self.statetable matchtable = self.matchtable completedblock = self.completedblock if not re.compile(r"\s*$").match(inchunk): #Ignore empty lines if verbose > 1: print 'new content:', inchunk p = re.compile(r"_([a-z0-9]+)_") #Start new entry in statetable if it expects some content for i in range(len(matchtable)): if p.match(matchtable[i][0]): state.append({}) state[-1]['linematch'] = i state[-1]['begin'] = self.htmlpos state[-1]['position'] = 0 state[-1]['content'] = [] if verbose > 2: print 'creating new state %d starting at position %d for matchline %d: ' % (len(state),self.htmlpos, i) if len(state) > 0: for i in reversed(range(len(state))): m = p.match(matchtable[state[i]['linematch']][state[i]['position']]) if m: state[i]['content'].append(m.group(1)) #Remove some special characters if self.replacecr: inchunk = string.replace(inchunk,'\r',self.replacecrval) inchunk = string.replace(inchunk,'\n',self.replacecrval) inchunk = string.replace(inchunk,'\r','\n') #remove html space character inchunk = string.replace(inchunk, '\xc2\xa0', ' ') state[i]['content'].append(inchunk) state[i]['position'] += 1 if verbose > 2: print 'progressing state %d (match[%d]), now in position %d' % (i,state[i]['linematch'],state[i]['position']) print "state[%d]['content']=%s" % (i,state[i]['content']) self.checkifcompleted(state, i, completedblock) else: if verbose > 2: print 'delete state %d (match[%d]), it has failed to progress at position %d' % (i,state[i]['linematch'],state[i]['position']) del state[i] self.scancompleted(state,completedblock) self.htmlpos += 1 else: #Handle the case were content is empty and we were expected a field. 
if self.emptyfield and (len(state) > 0): for i in reversed(range(len(state))): m = re.compile(r"_([a-z0-9]+)_").match(matchtable[state[i]['linematch']][state[i]['position']]) if m: if verbose > 1: print 'empty new content when we were expecting a field to fill', inchunk state[i]['content'].append(m.group(1)) #Remove some special characters inchunk = string.replace(inchunk,'\r','') inchunk = string.replace(inchunk,'\n','') state[i]['content'].append(inchunk) state[i]['position'] += 1 if verbose > 2: print 'progressing state %d (match[%d]), now in position %d' % (i,state[i]['linematch'],state[i]['position']) print "state[%d]['content']=%s" % (i,state[i]['content']) self.checkifcompleted(state, i, completedblock) self.scancompleted(state,completedblock) self.htmlpos += 1 return inchunk def pdecl(self, thetag): """Called when we encounter the *start* of a declaration or comment. 1: print 'comment 1: print 'processing 1: print 'end tag: ' + thetag return '<' + thetag + '>' def emptytag(self, thetag): """Called when we encounter a tag that we can't extract any valid name or attributes from. It is passed the tag contents and just returns it.""" return '<' + thetag + '>' def handletag(self, name, attrs, thetag): """Called when we encounter a tag. 
Is passed the tag name and a list of (attrname, attrvalue) - and the original tag contents as a string.""" state = self.statetable matchtable = self.matchtable completedblock = self.completedblock if self.completefont and name == "font": m = re.compile("""color=['"]?#?([0-9a-f]+|[a-zA-Z]+)['"]?(\s|$)""").search(thetag.lower()) if m: name = name + m.group(1) if self.completeclass: m = re.compile("""class=['"]?(\S+?)['"]?(\s|$)""").search(thetag.lower()) if m: name = name + m.group(1) if verbose > 1: print 'new tag', name #Start new entry in statetable if it matches the first entry in the respective match table for i in range(len(matchtable)): if re.compile(matchtable[i][0]).match(name): state.append({}) state[-1]['linematch'] = i state[-1]['begin'] = self.htmlpos state[-1]['position'] = 0 state[-1]['content'] = [] if verbose > 2: print 'creating new state %d starting at position %d for matchline %d: ' % (len(state),self.htmlpos, i) #advance all pointers in the statetable, or remove the state if it stopped matching for i in reversed(range(len(state))): if re.compile(matchtable[state[i]['linematch']][state[i]['position']]).match(name): state[i]['position'] += 1 if verbose > 2: print 'progressing state %d (match[%d]), now in position %d' % (i,state[i]['linematch'],state[i]['position']) self.checkifcompleted(state, i, completedblock) else: if verbose > 2: print 'delete state %d (match[%d]), it has failed to progress at position %d' % (i,state[i]['linematch'],state[i]['position']) del state[i] self.scancompleted(state,completedblock) self.htmlpos += 1 return '<' + thetag + '>' class Web: def __init__(self, url, timeout=30, data=None): self.handle = None if data: txdata = urllib.urlencode(data) else: txdata = None txheaders = {'User-agent' : 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'} # timeout in seconds socket.setdefaulttimeout(timeout) try: req = urllib2.Request(url, txdata, txheaders) # create a request object self.handle = urllib2.urlopen(req) except 
IOError, e: errorstr = 'We failed to open "%s".' % url if hasattr(e, 'code'): errorstr += '\nWe failed with error code - %s.' % e.code elif hasattr(e, 'reason'): errorstr += "\nThe error object has the following 'reason' attribute :\n" errorstr += repr(e.reason) errorstr += "\nThis usually means the server doesn't exist, is down, or we don't have an internet connection." raise Error(errorstr,200) def read(self): return self.handle.read() def close(self): return self.handle.close() def readline(self): return self.handle.readline() class XMLTV: def __init__(self): self.doc , self.tv_xml = self.startdoc() def startdoc(self): disclaimer= """ Important Disclaimer: All data provided is the copyright of the original provider we're fetching the source from. You are required to check with each provider the acceptable terms of use and the various legal obligations you must adhere to. The data is usually restricted to personal use only. The author of this xmltv file makes no guarantee of any kind nor does he endorse the content. 
The author explicitely distances himself from the content provided by this xmltv file Information collected from: Al Jazeera: www.radiotimes.com Bloomberg: http://www.bloomberg.com Channel NewsAsia: http://www.channelnewsasia.com World of Fashion: http://www.worldfashion.tv Eurosport: http://yahoo.eurosport.com DW: http://www.dw-world.de BVN: http://www.bvn.nl Duna : http://www.dunatv.hu TRT International: http://www.trt.net.tr TVE: www.rtve.es CCTV channels: http://www.cctv.com Cuba Vision: http://www.cubavision.cubaweb.cu ERT World: http://tvradio.ert.gr TVRI: http://www.tvri.co.id RTM (Malaysia): http://www.rtm.net.my Russia Today: http://www.russiatoday.ru SCTV: http://www.communitytv.com.au TV5 : www.tv5.org Voice of America: http://www.ibb.gov VTV4: http://www.vtv.vn Thai TV5: http://www.tv5.co.th NASA TV, Classic, Playboy and Adult One: From TPG xmltv feed If you notice errors, missing channels or discrepancies, thank you for contacting the author: reg-jya-xmltv@avenard.org """ # Create the xml document doc = Document() #Add disclaimer to xmltv disclaimer_xml = doc.createComment(disclaimer) doc.appendChild(disclaimer_xml) # Create the base element tv_xml = doc.createElement("tv") tv_xml.setAttribute("source-info-name","TPG's IPTV from various web sites") tv_xml.setAttribute("generator-info-name","JY Avenard TPG's IPTV xmltv generator") tv_xml.setAttribute("generator-info-url", "http://www.avenard.org/iptv/") doc.appendChild(tv_xml) return doc, tv_xml def filterxmltv(self, data, timezone, offset=day_offset, days=maxdays, removepattern=False): """Create xmltv file containing only relevant programmes and channels""" docxml = xml.dom.minidom.parse(StringIO.StringIO(data)) doc , tv_xml = self.startdoc() #Build min and max time based on time offset and max days #If offset is 0, retrieve all the content available from the source delta = datetime.timedelta(days=offset) #Pytz has inverted timezone tz = pytz.timezone('Etc/GMT%+d' % ( -timezone / 100)) mindate = 
datetime.datetime.now(tz=tz) + delta maxdate = mindate + datetime.timedelta(days=days) mindate = mindate.year * 10000000000 + mindate.month * 100000000 + mindate.day * 1000000 maxdate = maxdate.year * 10000000000 + maxdate.month * 100000000 + maxdate.day * 1000000 p1 = re.compile(r'([0-9]+)\s*([\+|-]([0-9]+))?') if removepattern: pattern = re.compile(removepattern[0]) #Scan the programme list and flag the used channels okchannellist = {} programmelist = docxml.getElementsByTagName('programme') for x in programmelist: ignore = False channel = '' for y in x.attributes.keys(): name = x.attributes[y].name value = x.attributes[y].value if name == 'start': #If before --offset, ignore m = p1.match(value) if not m: errorstr = 'Invalid Programme Start Date. Abort (%s)' % value raise Error(errorstr, 500) if int(m.group(1)) < mindate: ignore = True break #If before --offset, ignore if int(m.group(1)) >= maxdate: ignore = True break if name == 'channel': channel = value if not ignore: if verbose > 1: print 'adding' , channel, 'in valid channel list' okchannellist[channel] = True if verbose > 1: print okchannellist #Create the channel list channellist = docxml.getElementsByTagName('channel') for x in channellist: ignore = False for y in x.attributes.keys(): if x.attributes[y].name == 'id': value = x.attributes[y].value if okchannellist.has_key(value): if verbose > 1: print 'Adding channel' , value else: if verbose > 1: print 'Ignoring channel' , value ignore = True break if not ignore: #Copy the original attributes elem = doc.createElement("channel") for y in x.attributes.keys(): name = x.attributes[y].name value = x.attributes[y].value if name == 'id' and removepattern: value = pattern.sub(removepattern[1],value) elem.setAttribute(name,value) for y in x.getElementsByTagName('display-name'): elem.appendChild(y.cloneNode(True)) tv_xml.appendChild(elem) programmelist = docxml.getElementsByTagName('programme') for x in programmelist: ignore = False elem = 
doc.createElement("programme") #Copy the original attributes for y in x.attributes.keys(): name = x.attributes[y].name value = x.attributes[y].value if name == 'start': #If before --offset, ignore m = p1.match(value) if int(m.group(1)) < mindate: ignore = True break #If before --offset, ignore if int(m.group(1)) >= maxdate: ignore = True break if name == 'channel' and removepattern: value = pattern.sub(removepattern[1],value) elem.setAttribute(name,value) if not ignore: list_programmeelement = [ 'title', 'sub-title', 'desc', 'credits', 'date', 'category', 'language', 'orig-language', 'length', 'icon', 'url', 'country', 'episode-num', 'video', 'audio', 'previously-shown', 'premiere', 'last-chance', 'new', 'subtitles', 'rating', 'star-rating' ] #copy all the previous properties in the right order for y in list_programmeelement: for z in x.getElementsByTagName(y): elem.appendChild(z.cloneNode(True)) tv_xml.appendChild(elem) return doc def mergechannels(self, docxml): #Create the channel list channellist = docxml.getElementsByTagName('channel') for x in channellist: #Copy the original attributes elem = docxml.createElement("channel") for y in x.attributes.keys(): elem.setAttribute(x.attributes[y].name,x.attributes[y].value) for y in x.getElementsByTagName('display-name'): elem.appendChild(y.cloneNode(True)) self.tv_xml.appendChild(elem) def mergeprogrammes(self, docxml): programmelist = docxml.getElementsByTagName('programme') for x in programmelist: elem = self.doc.createElement("programme") #Copy the original attributes for y in x.attributes.keys(): name = x.attributes[y].name value = x.attributes[y].value elem.setAttribute(name,value) list_programmeelement = [ 'title', 'sub-title', 'desc', 'credits', 'date', 'category', 'language', 'orig-language', 'length', 'icon', 'url', 'country', 'episode-num', 'video', 'audio', 'previously-shown', 'premiere', 'last-chance', 'new', 'subtitles', 'rating', 'star-rating' ] #copy all the previous properties in the right order for y 
in list_programmeelement: for z in x.getElementsByTagName(y): elem.appendChild(z.cloneNode(True)) self.tv_xml.appendChild(elem) def adjusttimezone(self, docxml, timezone): doc , tv_xml = self.startdoc() m = re.compile(r'([+-]?([0-9]{1,2})([0-9]{2}))').match(`timezone`) tz = int(m.group(1)) tzh = int(m.group(2)) tzm = int(m.group(3)) if tz < 0: tzm = -tzm tzh = -tzh tzdelta = datetime.timedelta(hours=tzh,minutes=tzm) p1 = re.compile(r'([0-9]{4})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})([0-9]{2})\s*(([\+|-])([0-9]{2})([0-9]{2}))') #Create the channel list channellist = docxml.getElementsByTagName('channel') for x in channellist: #Copy the original attributes elem = doc.createElement("channel") for y in x.attributes.keys(): name = x.attributes[y].name value = x.attributes[y].value elem.setAttribute(name,value) for y in x.getElementsByTagName('display-name'): elem.appendChild(y.cloneNode(True)) tv_xml.appendChild(elem) programmelist = docxml.getElementsByTagName('programme') for x in programmelist: elem = doc.createElement("programme") #Copy the original attributes for y in x.attributes.keys(): name = x.attributes[y].name value = x.attributes[y].value if name == 'start' or name == 'stop': m1 = p1.match(value) date = datetime.datetime(int(m1.group(1)), int(m1.group(2)), int(m1.group(3)), int(m1.group(4)), int(m1.group(5)), int(m1.group(6))) #Put back date in UTC if m1.group(7): if m1.group(8) == '+': date -= datetime.timedelta(hours=int(m1.group(9)),minutes=int(m1.group(10))) else: date += datetime.timedelta(hours=int(m1.group(9)),minutes=int(m1.group(10))) date += tzdelta startdate = date.year * 10000000000 + date.month * 100000000 + date.day * 1000000 + date.hour * 10000 + date.minute * 100 value = '%014d' % startdate + ' %+05d' % tz elem.setAttribute(name,value) list_programmeelement = [ 'title', 'sub-title', 'desc', 'credits', 'date', 'category', 'language', 'orig-language', 'length', 'icon', 'url', 'country', 'episode-num', 'video', 'audio', 'previously-shown', 
'premiere', 'last-chance', 'new', 'subtitles', 'rating', 'star-rating' ] #copy all the previous properties in the right order for y in list_programmeelement: for z in x.getElementsByTagName(y): elem.appendChild(z.cloneNode(True)) tv_xml.appendChild(elem) return doc def scangenre(self, tab): genre = { 'news' : 'News' , 'journal' : 'News' } for i in reversed(range(len(tab)/2)): x = tab[i*2] y = tab[i*2+1] if x == 'title': for j in genre: if y.lower().find(j) >= 0: tab[i*2+2:i*2+2] = [ genre[j] ] tab[i*2+2:i*2+2] = [ 'category' ] return tab def textdecode(self,s, html=True, encoding='iso8859-1',replacecr=True): pcr = re.compile(r'^\s*|\s*$', re.UNICODE) pcr1 = re.compile(r'\n|\r', re.UNICODE) pspace = re.compile(r'\s+', re.UNICODE) p = re.compile('&(%s);' % '|'.join(htmlentitydefs.name2codepoint), re.UNICODE) s = unicode(unicode(s, encoding).encode('utf-8'),'utf-8') #Beautiful one-liner I found to replace html character code into ISO-8859-1 if html: s = p.sub(lambda m: unichr(htmlentitydefs.name2codepoint[m.group(1)]), s) if replacecr: s = pcr1.sub(' ', s) s = pspace.sub(' ', s) #Remove leading and trailing space and CR return pcr.sub('', s) def addchannel(self, channel_name, channel_id, lang=None): # Create element channel_xml = self.doc.createElement("channel") channel_xml.setAttribute("id",channel_id) # Create element displayname_xml = self.doc.createElement("display-name") if lang: displayname_xml.setAttribute("lang",lang) channel_xml.appendChild(displayname_xml) ptext = self.doc.createTextNode(channel_name) displayname_xml.appendChild(ptext) self.tv_xml.appendChild(channel_xml) def addqueueprogramme(self, processed, channelid, offset=0, days=8, dst=0, timezone=0, startdate=None, marker='time', encoding='iso8859-1'): doc = self.doc tv_xml = self.tv_xml channelname = channelid monthvalue = { 'janvier' : 1 , 'fevrier' : 2, 'mars' : 3 , 'avril' : 4, 'mai' : 5, 'juin' : 6, 'juillet' : 7, 'aout' : 8, 'septembre' : 9, 'octobre' : 10, 'novembre' : 11, 'decembre' : 12, 
'january' : 1 , 'february' : 2, 'march' : 3 , 'april' : 4, 'may' : 5, 'june' : 6, 'july' : 7, 'august' : 8, 'september' : 9, 'october' : 10, 'november' : 11, 'december' : 12, 'jan' : 1 , 'feb' : 2, 'mar' : 3 , 'apr' : 4, 'may' : 5, 'jun' : 6, 'jul' : 7, 'aug' : 8, 'sep' : 9, 'oct' : 10, 'nov' : 11, 'dec' : 12 } month_day = [ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] #Calculate date wintertime and date summertime scannedprogramme = 0 timezoneori = timezone lastsundayw = monthwinter * 100 + 31 - datetime.datetime(datetime.datetime.today( ).year, monthwinter, 31).isoweekday( ) % 7 lastsundays = monthsummer * 100 + 31 - datetime.datetime(datetime.datetime.today( ).year, monthsummer, 31).isoweekday( ) % 7 #Hack to handle TV5 incorrect time change for 2008. changetimetoday = 0 #Build min and max time based on time offset and max days #If offset is 0, retrieve all the content available from the source delta = datetime.timedelta(days=offset) #Pytz has inverted timezone tz = pytz.timezone('Etc/GMT%+d' % ( -timezone / 100)) mindate = datetime.datetime.now(tz=tz) + delta maxdate = mindate + datetime.timedelta(days=days) mindate = mindate.year * 10000000000 + mindate.month * 100000000 + mindate.day * 1000000 maxdate = maxdate.year * 10000000000 + maxdate.month * 100000000 + maxdate.day * 1000000 marker = { 'date':1 , 'time':2, 'title':3, 'category':4, 'subtitle':5, 'description':6 }[marker] state = ignore = 0 if not startdate: currentdate = datetime.datetime.now(tz=tz) currentdate = datetime.datetime(currentdate.year, currentdate.month, currentdate.day) #Create a stack of two programmes plist = [] plist.append({}) plist[0]['time'] = currentdate plist[0]['tz'] = timezoneori plist[0]['title'] = '' plist[0]['subtitle'] = '' plist[0]['category'] = '' plist[0]['description'] = '' plist.append(copy.deepcopy(plist[0])) title = category = '' subtitle = [] description = [] reempty = re.compile(r'\s*$') #different possible states: # 1 = date # 2 = time # 3 = title # 4 = category # 
5 = subtitle # 6 = description for loop in range(len(processed) / 2): x = processed[loop*2] if x == 'date': state = 1 #Handle the date elif x == 'time': state = 2 elif x == 'title': state = 3 elif x == 'category': state = 4 elif x == 'subtitle': state = 5 elif x == 'desc': state = 6 #Not a valid field, just skip it else: continue x = processed[loop*2+1] if verbose > 1: print 'x' , x , 'state', state, 'ignore', ignore #If we got a new marker, time to process the programme if state == marker: if verbose > 1: print 'processing new programme' if not ignore: #New entry, reset the ignore flag #Calculate the DST value timezone = timezoneori plist[0] = copy.deepcopy(plist[1]) plist[1]['time'] = currentdate plist[1]['tz'] = timezone plist[1]['title'] = title plist[1]['subtitle'] = '' for i in subtitle: plist[1]['subtitle'] += i + ' ' plist[1]['category'] = category plist[1]['description'] = '' for i in description: plist[1]['description'] += i + ' ' title = category = '' description = [] subtitle = [] if verbose > 1: print 'scanned programme', scannedprogramme if verbose > 2: print 'plist=', plist if scannedprogramme > 0 and verbose > 0: print 'date = ' + `plist[1]['time'].day` + ' title = ' + plist[1]['title'] + ' time: %02d:%02d %+05d' % (plist[1]['time'].hour,plist[1]['time'].minute, plist[1]['tz']) if scannedprogramme >= 2: #We're starting a new program, so print the previous one if it exist startdate = plist[0]['time'].year * 10000000000 + plist[0]['time'].month * 100000000 + plist[0]['time'].day * 1000000 + plist[0]['time'].hour * 10000 + plist[0]['time'].minute * 100 stopdate = plist[1]['time'].year * 10000000000 + plist[1]['time'].month * 100000000 + plist[1]['time'].day * 1000000 + plist[1]['time'].hour * 10000 + plist[1]['time'].minute * 100 if startdate >= maxdate: break if startdate >= mindate: startstr = '%014d' % startdate + ' %+05d' % plist[0]['tz'] stopstr = '%014d' % stopdate + ' %+05d' % plist[1]['tz'] if verbose > 0: print 'start=%s' % startstr + ' 
stop=%s' % stopstr programme = doc.createElement("programme") programme.setAttribute("channel", channelname) programme.setAttribute("start", startstr) programme.setAttribute("stop", stopstr) tv_xml.appendChild(programme) if plist[0]['title'] != '': # Create a element title_xml = doc.createElement("title") programme.appendChild(title_xml) # Give the <title> element some text ptext = doc.createTextNode(self.textdecode(plist[0]['title'],encoding=encoding,replacecr=True)) title_xml.appendChild(ptext) else: errorstr = 'Error: title must not be empty: %s - %s' % (channelname,startstr) errorstr += "\nIt's likely you are getting this error as the source page have changed its layout. Please contact the author to let him know" raise Error(errorstr,300) if plist[0]['subtitle'] != '': text = self.textdecode(plist[0]['subtitle'],encoding=encoding,replacecr=True) if text != '': ptext = doc.createTextNode(text) # Create a <sub-title> element subtitle_xml = doc.createElement("sub-title") programme.appendChild(subtitle_xml) subtitle_xml.appendChild(ptext) if plist[0]['description'] != '': text = self.textdecode(plist[0]['description'],encoding=encoding,replacecr=True) if text != '': ptext = doc.createTextNode(text) # Create a <desc> element desc_xml = doc.createElement("desc") programme.appendChild(desc_xml) # Give the <desc> element some text desc_xml.appendChild(ptext) if plist[0]['category'] != '': # Create a <category> element category_xml = doc.createElement("category") programme.appendChild(category_xml) # Give the <desc> elemenet some text plist[0]['category'] = string.replace(plist[0]['category'],' [','') plist[0]['category'] = string.replace(plist[0]['category'],']','') ptext = doc.createTextNode(self.textdecode(plist[0]['category'],encoding=encoding,replacecr=True)) category_xml.appendChild(ptext) scannedprogramme += 1 else: ignore = 0 if state == 1: m = re.compile(r'\s*\S*?\s*([0-9]+) (\S+?)[,\s]*([0-9]+)|([0-9]+)/\s*([0-9]+)/([0-9]+)|^([a-zA-Z])+$').match(x) if m: if 
m.group(5): currentdate_tmp = datetime.datetime(int(m.group(6)),int(m.group(5)),int(m.group(4))) elif m.group(2) and monthvalue.has_key(m.group(2).lower()): currentdate_tmp = datetime.datetime(int(m.group(3)),monthvalue[m.group(2).lower()],int(m.group(1))) else: raise Error('%s: date is not valid (received %s)' % (channelid, x),301) #If we have already calculated the new date, no need to replace as we would loose the time otherwise if verbose > 1: print 'date field', currentdate, currentdate_tmp if not (currentdate_tmp.year == currentdate.year and currentdate_tmp.month == currentdate.month and currentdate_tmp.day == currentdate.day): currentdate = currentdate_tmp #Calculate DST timezone = timezoneori else: raise Error('%s: date is not valid (received %s)' % (channelid, x),302) elif state == 2: m = re.compile(r'( |\s)*([0-2]?[0-9])[:\.]?([0-9]{2})\s*(AM|PM|am|pm)?', re.UNICODE).match(x) if m: hour = int(m.group(2)) minute = int(m.group(3)) if hour == 24: hour = 0 if m.group(4): #time is 12AM -> 00:00 if m.group(4).lower() == 'am' and hour == 12: hour = 0 #time is xxPM -> add 12 hours elif m.group(4).lower() == 'pm' and hour < 12: hour += 12 currentdate = datetime.datetime(currentdate.year, currentdate.month, currentdate.day, hour, minute) if hour < plist[1]['time'].hour and currentdate.day == plist[1]['time'].day and currentdate.month == plist[1]['time'].month: delta = datetime.timedelta(days=1) currentdate += delta else: raise Error('Invalid time for %s (got: %s) in %s' % (channelid,repr(x),repr(currentdate)) ,303) elif state == 3: #New title, process the previous entry and add it to the xml tree #Check if TV5 just changed the time if re.compile(r"changement d\'heure").match(x.lower()): if verbose > 1: print 'Change DST' ignore = 1 if dst: #Set changetimetoday if DST is scheduled to happen today changetimetoday = 1 else: title = x elif state == 4 and not ignore: category = x elif state == 5 and not ignore: subtitle.append(x) elif state == 6 and not ignore: 
description.append(x) state = 0 #Finish last entry if verbose > 1: print 'processing new programme' if not ignore: #New entry, reset the ignore flag #Calculate the DST value timezone = timezoneori plist[0] = copy.deepcopy(plist[1]) plist[1]['time'] = currentdate plist[1]['tz'] = timezone plist[1]['title'] = title plist[1]['subtitle'] = '' for i in subtitle: plist[1]['subtitle'] += i + ' ' plist[1]['category'] = category plist[1]['description'] = '' for i in description: plist[1]['description'] += i + ' ' title = category = '' description = [] subtitle = [] if verbose > 1: print 'scanned programme', scannedprogramme if verbose > 2: print 'plist=', plist if scannedprogramme > 0 and verbose > 0: print 'date = ' + `plist[1]['time'].day` + ' title = ' + plist[1]['title'] + ' time: %02d:%02d %+05d' % (plist[1]['time'].hour,plist[1]['time'].minute, plist[1]['tz']) if scannedprogramme >= 2: #We're starting a new program, so print the previous one if it exist startdate = plist[0]['time'].year * 10000000000 + plist[0]['time'].month * 100000000 + plist[0]['time'].day * 1000000 + plist[0]['time'].hour * 10000 + plist[0]['time'].minute * 100 stopdate = plist[1]['time'].year * 10000000000 + plist[1]['time'].month * 100000000 + plist[1]['time'].day * 1000000 + plist[1]['time'].hour * 10000 + plist[1]['time'].minute * 100 if startdate >= maxdate: return if startdate >= mindate: startstr = '%014d' % startdate + ' %+05d' % plist[0]['tz'] stopstr = '%014d' % stopdate + ' %+05d' % plist[1]['tz'] if verbose > 0: print 'start=%s' % startstr + ' stop=%s' % stopstr programme = doc.createElement("programme") programme.setAttribute("channel", channelname) programme.setAttribute("start", startstr) programme.setAttribute("stop", stopstr) tv_xml.appendChild(programme) if plist[0]['title'] != '': # Create a <title> element title_xml = doc.createElement("title") programme.appendChild(title_xml) # Give the <title> element some text ptext = 
doc.createTextNode(self.textdecode(plist[0]['title'],encoding=encoding,replacecr=True)) title_xml.appendChild(ptext) else: errorstr = 'Error: title must not be empty: %s - %s' % (channelname,startstr) errorstr += "\nIt's likely you are getting this error as the source page have changed its layout. Please contact the author to let him know" raise Error(errorstr,300) if plist[0]['subtitle'] != '': text = self.textdecode(plist[0]['subtitle'],encoding=encoding,replacecr=True) if text != '': ptext = doc.createTextNode(text) # Create a <sub-title> element subtitle_xml = doc.createElement("sub-title") programme.appendChild(subtitle_xml) subtitle_xml.appendChild(ptext) if plist[0]['description'] != '': text = self.textdecode(plist[0]['description'],encoding=encoding,replacecr=True) if text != '': ptext = doc.createTextNode(text) # Create a <desc> element desc_xml = doc.createElement("desc") programme.appendChild(desc_xml) # Give the <desc> element some text desc_xml.appendChild(ptext) if plist[0]['category'] != '': # Create a <category> element category_xml = doc.createElement("category") programme.appendChild(category_xml) # Give the <desc> elemenet some text plist[0]['category'] = string.replace(plist[0]['category'],' [','') plist[0]['category'] = string.replace(plist[0]['category'],']','') ptext = doc.createTextNode(self.textdecode(plist[0]['category'],encoding=encoding,replacecr=True)) category_xml.appendChild(ptext) scannedprogramme += 1 def toxml(self,docxml=False): if docxml: return docxml.toxml(encoding="utf-8") else: return self.doc.toxml(encoding="utf-8") def toprettyxml(self,docxml=False): if docxml: return docxml.toprettyxml(indent=" ",encoding="utf-8") else: return self.doc.toprettyxml(indent=" ",encoding="utf-8") def calc_timezone(date): d = date.utcoffset().seconds / 36 if d > 1200: d -= 2400 return d class TVGRAB: def time_title(self,result): #format is XX:XX Title, separate time and title p = re.compile(r'\s*\[?([0-9]{1,2}(:|\.)[0-9]{2})\]?\s*(.*?)\s*$', 
re.UNICODE) result2 = [] ignore = False for i in range(len(result)/2): x = result[i*2] y = result[i*2+1] if verbose > 1: print 'x=',x,'y=',repr(y) if x == 'time': m = p.match(y) if m: result2.append('time') result2.append(m.group(1)) result2.append('title') result2.append(m.group(3)) ignore = False else: ignore = True elif not ignore: result2.append(x) result2.append(y) elif verbose > 1: print 'ignoring' return result2 def main(self, conf, output=sys.stdout, outfilename=False, reuse=False, pretty=False, converttz=False, timezone=0000,maxdays=7,offset=0): finishtemplate = { 'id': False, 'name': False, 'timezone': False, 'dst': 0, 'result': False, 'lang': False , 'marker': 'time', 'encoding': 'iso8859-1'} finishtab= [] output2 = output xmltv = XMLTV() if reuse: if outfilename: output2 = open(outfilename,'w') if converttz: doc = xmltv.adjusttimezone(reuse.doc,timezone) if pretty: output2.write(xmltv.toprettyxml(docxml=doc)) else: output2.write(xmltv.toxml(docxml=doc)) else: doc = reuse.doc if pretty: output2.write(reuse.toprettyxml()) else: output2.write(reuse.toxml()) if outfilename: output2.close() return doc #Process Al Jazeera data if grab_channel['aljazeera']: if verbose > 0: print 'Processing Al JAzeera' try: handle = Web(ajurl) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: # Description of Radio Times data fields (23 in total) (extracted from tv_grab_uk_rt: # # title - the programme title (text) # sub_title - infrequently defined - preference is given to episode # if defined (text) # episode - the name of a particular episode of the programme and/or # the episode's position in the current series (text) # year - the year of production (text) # director - the programme's director(s) (text) # cast - the programme's cast (may include character details) (text) # premiere - whether this is a film's first showing (boolean) # film - whether the programme is a film (boolean) # repeat - whether the programme has been shown before (boolean) # 
subtitles - whether subtitles are available (boolean) # widescreen - whether the broadcast is 16:9 widescreen (boolean) # new_series - whether the programme is the first episode in a # series new (boolean) # deaf_signed - whether in-vision signing is available (boolean) # blank_and_white - whether the broadcast is not in colour (boolean) # star_rating - a star rating between 0 and 5 for films (text) # certificate - the BBFC certificate for the programme (text) # genre - the genre of the programme (text) # desc - a description of the programme. Can be a specific review by a # Radio Times reviewer (text) # choice - whether the programme is recommended by the # Radio Times (boolean) # date - the transmission date (text) # start - the transmission start time for the programme (text) # stop - the transmissions stop time for the programme (text) # duration_mins - the duration of the programme in minutes (text) resultaj = [] # programmeformat = [ 'title', 'subtitle','episode', 'year', 'director', 'cast', 'premiere', 'film', 'repeat', 'subtitles', 'widescreen', # 'new_series', 'deaf_signed', 'colour', 'rating', 'certificate', 'genre', 'desc', 'choice', 'date', 'start', 'stop', 'duration' ] programmeformat = [ 'title', '','subtitle', '', '', '', '', '', '', '', '', '', '', '', '', '', 'category', 'desc', '', 'date', 'time', '', '' ] tz = pytz.timezone('Europe/London') date = datetime.datetime.now(tz=tz) #Rebuild list of programme item for XMLTV class while True: x = handle.readline() if not x: break x = string.replace(x,'\r','') x = string.replace(x,'\n','') tab = re.compile(r"~").split(x) if len(tab) == 23: for i in range(len(programmeformat)): if tab[i] and programmeformat[i]: #If no category, ignore if not (programmeformat[i] == 'category' and tab[i] == 'No Genre'): resultaj.append(programmeformat[i]) resultaj.append(tab[i]) else: if verbose > 0: print 'incorrect programme field, only %d fields' % len(tab) if verbose > 0: print 'Done grabbing data.. 
processing' resultaj = xmltv.scangenre(resultaj) if not resultaj: print >> sys.stderr, "Couldn't extract any content from Al Jazeera" #raise Error("Couldn't extract any content from Al Jazeera",401) else: #Add it to the processed table finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = ajidname finishtab[-1]['result'] = resultaj finishtab[-1]['lang'] = 'en' finishtab[-1]['name'] = ajchannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['dst'] = 0 finishtab[-1]['marker'] = 'date' else: if verbose > 0: print 'Ignoring Al Jazeera' if grab_channel['bloomberg']: if verbose > 0: print 'Processing Bloomberg' #Calculate timezone tz = pytz.timezone('Asia/Hong_Kong') date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) weekday = date.isoweekday() % 7 #Read 7 days maximum worth of programme days = min(7-offset, maxdays+1) parser = Scraper(statebloomberg) day_tab = [ 'sunday', 'monday' , 'tuesday' , 'wednesday', 'thursday', 'friday', 'saturday' ] for i in range(days): url = '%s_%s.html' % ( bloombergurl, day_tab[weekday]) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') weekday = (weekday + 1) % 7 result = parser.processed parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from Bloomberg" #raise Error("Couldn't extract any content from Bloomberg",412) else: result[:0] = ['date','%02d/%02d/%04d' % (date.day,date.month,date.year)] result = xmltv.scangenre(result) finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = bloombergidname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'en' finishtab[-1]['name'] = bloombergchannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['encoding'] = 'utf-8' finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring Bloomberg' if grab_channel['newsasia']: 
if verbose > 0: print 'Processing NewsAsia' #Calculate timezone in Singapore tz = pytz.timezone('Singapore') date = datetime.datetime.now(tz=tz) #Read 8 days maximum worth of programme days = min(8, maxdays+1) parser = Scraper(statenewsasia) for i in range(days): url = '%s?day=%d' % ( newsasiaurl , i + offset - 1) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') newsasiaresult = parser.processed parser.close() if not newsasiaresult: print >> sys.stderr, "Couldn't extract any content from NewsAsia" #raise Error("Couldn't extract any content from NewsAsia",406) else: finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = newsasiaidname finishtab[-1]['result'] = newsasiaresult finishtab[-1]['lang'] = 'en' finishtab[-1]['name'] = newsasiachannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring NewsAsia' if grab_channel['worldfashion']: if verbose > 0: print 'Processing World of Fashion' tz = pytz.timezone('Europe/Moscow') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset-1) #Read 3 days maximum worth of programme, put maxdays+1 so we can get info about the last program of the day days = min(4-offset, maxdays+1) parser = Scraper(statewof) for i in range(days): url = '%s?date=%02d.%02d.%04d' % (wofurl, date.day, date.month, date.year) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') date += datetime.timedelta(days=1) result = parser.processed parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from World of Fashion" #raise Error("Couldn't extract any content from World of Fashion",415) else: result[:0] = 
['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = wofidname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'en' finishtab[-1]['name'] = wofchannelname finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring World of Fashion' if grab_channel['eurosport']: if verbose > 0: print 'Processing Eurosport' #Calculate timezone tz = pytz.timezone('Etc/GMT') offset2 = offset if offset > 0: offset2 -= 1 date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset2) #Read 8 days maximum worth of programme days = min(8-offset, maxdays+1) parser = Scraper(stateeurosport) for i in range(days): url = '%s_day%d.shtml' % ( eurosporturl, i+offset) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() #Search for "Eurosport World", as this is where the programme starts start = data.find('alt="Eurosport World"') if start > 0: data = '<img ' + data[start:] parser.feed(data,ignoretag='br') eurosportresult = parser.processed parser.close() if not eurosportresult: print >> sys.stderr, "Couldn't extract any content from Eurosport" #raise Error("Couldn't extract any content from NewsAsia",411) else: eurosportresult[:0] = ['date','%02d/%02d/%04d' % (date.day,date.month,date.year)] eurosportresult = xmltv.scangenre(eurosportresult) finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = eurosportidname finishtab[-1]['result'] = eurosportresult finishtab[-1]['lang'] = 'en' finishtab[-1]['name'] = eurosportchannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['encoding'] = 'utf-8' finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring Eurosport' if grab_channel['eurosportnews']: if verbose > 0: print 'Processing Eurosport News' #Calculate timezone tz = 
pytz.timezone('Australia/Melbourne') date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) #Generate 8 days maximum worth of programme days = min(8, maxdays+1) result = [] for i in range(days): for j in range(24): result.append('time') result.append('%02d:00' % (j%24)) result.append('title') result.append('Eurosport News') result.append('desc') result.append('Interactive sports programme, with rolling sports news and updates every 15 minutes.') result[:0] = ['date','%02d/%02d/%04d' % (date.day,date.month,date.year)] finishtab.append(copy.deepcopy(finishtemplate)) result = xmltv.scangenre(result) finishtab[-1]['id'] = eurosportnewsidname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'en' finishtab[-1]['name'] = eurosportnewschannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring Eurosport News' if grab_channel['dw']: if verbose > 0: print 'Processing DW' #Generate URL tz = pytz.timezone('UTC') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) #Read web site data = '' parser = Scraper(statedw,completeclass=True) #DW only has 7 days worth of data from today days = min(7-offset, maxdays+1) for i in range(days): url = '%s?wday=%d&sprache=gb&schiene=dwtvasien&to=0' % (dwurl, date.isoweekday()) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') date += datetime.timedelta(days=1) result = parser.processed parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from DW" #raise Error("Couldn't extract any content from DW",404) else: result = xmltv.scangenre(result) result[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = dwidname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 
'dw' finishtab[-1]['name'] = dwchannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['encoding'] = 'utf-8' finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring DW' if grab_channel['trtint']: if verbose > 0: print 'Processing TRT International' #Generate URL tz = pytz.timezone('Asia/Istanbul') offset2 = offset if offset > 0: offset2 -= 1 date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset2) #Read web site data = '' parser = Scraper(statetrt,completeclass=True) #TRT only has 7 days worth of data from today days = min(7-offset, maxdays+1) counter = 1 for i in range(days): url = '%s?gunler=%d&kanal=5&akistur=1&tdgun=%d&control=0' % (trturl, date.isoweekday() % 7, counter+offset2) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') date += datetime.timedelta(days=1) counter += 1 result = parser.processed parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from TRT International" #raise Error("Couldn't extract any content from DW",404) else: result[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = trtidname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'tr' finishtab[-1]['name'] = trtchannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['encoding'] = 'utf-8' finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring TRT International' #Process lyngsat channels #lyngsat format: 0: id, 1: url, 2: name, 3: language, 5: timezone , 5: dst for x in lyngsat: if grab_channel[x[0]]: if verbose > 0: print 'Processing ' + x[0] #Read web site tz = pytz.timezone('Europe/Paris') date = datetime.datetime.now(tz=tz) url = x[1] + '?offset=0' if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: 
print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() url = x[1] + '?offset=1' if verbose > 0: print 'Reading url=%s' % url handle = Web(url) data = data + handle.read() handle.close() if verbose > 0: print 'Done grabbing data.. processing' #Extract HTML parser = Scraper(statelyngsat) parser.feed(data,ignoretag='br') resultlyngsat = parser.processed parser.close() #Add automatic category based on title resultlyngsat = xmltv.scangenre(resultlyngsat) if not resultlyngsat: print >> sys.stderr, "Couldn't extract any content from %s" % x[0] #raise Error("Couldn't extract any content from %s" % x[0], 403) else: resultlyngsat[:0] = ['date','%02d/%02d/%04d' % (date.day,date.month,date.year)] #Add it to the processed table finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = x[0] finishtab[-1]['result'] = resultlyngsat finishtab[-1]['lang'] = x[3] finishtab[-1]['name'] = x[2] finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['dst'] = x[5] else: if verbose > 0: print 'Ignoring ' + x[0] #Process TVE if grab_channel['tve']: if verbose > 0: print 'Processing TVE' tz = pytz.timezone('Europe/Madrid') #TVE page starts at 6AM for each day, so start fetching the previous day too date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset - 1) #Grab up to 8 days worth of data j = min(8-offset, maxdays+1) data = '' parser = Scraper(statetve) for i in range(j): x = '%02d%02d' % (date.day, date.month) url = tveurl + x + '.htm' if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() parser.feed(data,emptyfield=False,ignoretag='br') handle.close() date += datetime.timedelta(days=1) resulttve = parser.processed parser.close() #TVE has shocking html, so purely extract all possible text and run regular expression to extract the time and title textextract = '' for i in range (len(resulttve)/2): x = 
resulttve[i*2] y = resulttve[i*2+1] if x == 'title': textextract += '\n' + y #Remove date field elif x == 'date': continue else: textextract += ' ' + y p = re.compile(r'\s*([0-9]{1,2}:[0-9]{2})(:[0-9]{2})?\s*(-|–)*\s*(.*)\s*?([\s\S]*?)(\s| )*([0-9]{1,2}:[0-9]{2}(:[0-9]{2})?\s*(-|–)*\s*[\s\S]*)') resulttve2 = [] finished = False while not finished: m = p.search(textextract) if m: if verbose > 2: print 'group', m.groups() resulttve2.append('time') resulttve2.append(m.group(1)) resulttve2.append('title') resulttve2.append(m.group(4)) description = m.group(5) if description: resulttve2.append('desc') resulttve2.append(description) textextract = m.group(7) else: finished = True #Retrieved last entry p = re.compile(r'\s*([0-9]{1,2}:[0-9]{2})(:[0-9]{2})?\s*(-|–)*\s*(.*)\s*?([\s\S]*?)') m = p.search(textextract) if m: if verbose > 2: print m.groups() resulttve2.append('time') resulttve2.append(m.group(1)) resulttve2.append('title') resulttve2.append(m.group(4)) description = m.group(5) if description: resulttve2.append('desc') resulttve2.append(description) if not resulttve2: print >> sys.stderr, "Couldn't extract any content from TVE" #raise Error("Couldn't extract any content from TVE",402) else: #Add date resulttve2[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] #Add it to the processed table finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = tveidname finishtab[-1]['result'] = resulttve2 finishtab[-1]['lang'] = 'es' finishtab[-1]['name'] = tvechannelname finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring TVE' if grab_channel['cctv']: if verbose > 0: print 'Processing CCTV' #Generate URL tz = pytz.timezone('Asia/Shanghai') for channel in cctvchannels: date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) parser = Scraper(statecctv,completeclass=True) days = min(8-offset, maxdays+1) for i in range(days): url = '%s%s/%02d/%04d%02d%02d.shtml' % 
(googletranslatechinese, cctvurl, channel['number'], date.year, date.month, date.day) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') date += datetime.timedelta(days=1) result = parser.processed parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from CCTV" #raise Error("Couldn't extract any content from CCTV",415) else: result[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] result = xmltv.scangenre(result) finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = channel['id'] finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'cn' finishtab[-1]['name'] = channel['name'] finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['dst'] = 0 finishtab[-1]['encoding'] = 'utf-8' else: if verbose > 0: print 'Ignoring CCTV' if grab_channel['cuba']: if verbose > 0: print 'Processing Cuba Vision' #Generate URL #Cuba Vison URL in in the format 'url'?Dia=x #Calculator today's day, Cuba time (GMT -4) 0->Sunday, 1->Monday etc... 
tz = pytz.timezone('Cuba') date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) weekday = date.isoweekday() % 7 #Read web site data = '' parser = Scraper(statecuba) #Cuba only has 7 days worth of data from today days = min(7-offset, maxdays+1) for i in range(days): if verbose > 0: print 'Reading url=%s?Dia=%d' % (cubaurl,weekday) try: handle = Web(cubaurl + '?Dia=' + `weekday`) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') weekday = (weekday + 1) % 7 cubaresult = parser.processed parser.close() if not cubaresult: print >> sys.stderr, "Couldn't extract any content from Cuba" #raise Error("Couldn't extract any content from Cuba",404) else: cubaresult[:0] = ['date','%02d/%02d/%04d' % (date.day,date.month,date.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = cubaidname finishtab[-1]['result'] = cubaresult finishtab[-1]['lang'] = 'es' finishtab[-1]['name'] = cubachannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring Cuba Vision' if grab_channel['bvn']: if verbose > 0: print 'Processing Dutch TV' tz = pytz.timezone('UTC') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) #Read 7 days maximum worth of programme, put maxdays+1 so we can get info about the last program of the day days = min(7, maxdays+1) result = [] p = re.compile(r"{\s*title:\s*'(?P<title>.*?)',\s*time:\s*'(?P<time>.*?)',\s*description:\s*'(?P<desc>.*?)(<a.*>(?P<desc2>.*?)</a>(?P<desc3>.*?))*',",re.UNICODE) #Calculate the day of the year for i in range(days): url = '%s?date=%04d%02d%02d' % (bvnurl, date.year, date.month, date.day) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() m = re.compile(r"var programs = new 
Array\((.*)\);",re.DOTALL).search(data) if m: token = p.finditer(m.group(1)) for j in token: result.append('time') result.append(j.group('time')) result.append('title') result.append(j.group('title')) result.append('desc') result.append(j.group('desc') + ' ' + (j.group('desc2') != None and j.group('desc2') or '') + ' ' + (j.group('desc3') != None and j.group('desc3') or '') ) else: break date += datetime.timedelta(days=1) result = xmltv.scangenre(result) if not result: print >> sys.stderr, "Couldn't extract any content from Dutch TV" #raise Error("Couldn't extract any content from Dutch TV",408) else: result[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = bvnidname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'nl' finishtab[-1]['name'] = bvnchannelname finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['dst'] = 0 finishtab[-1]['encoding'] = 'utf-8' else: if verbose > 0: print 'Ignoring Dutch TV' if grab_channel['ertworld']: if verbose > 0: print 'Processing ERT World' tz = pytz.timezone('Europe/Athens') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset-1) #Read 7 days maximum worth of programme, put maxdays+1 so we can get info about the last program of the day days = min(7, maxdays+1) parser = Scraper(stateertworld) #Calculate the day of the year for i in range(days): url = '%s?pday=%d' % (ertworldurl, (date - datetime.datetime(date.year,1,1,tzinfo=tz)).days + 1) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,emptyfield=False) date += datetime.timedelta(days=1) ertworldresult = xmltv.scangenre(self.time_title(parser.processed)) parser.close() if not ertworldresult: print >> sys.stderr, "Couldn't extract any content from ERT World" #raise Error("Couldn't extract any content from ERT 
World",408) else: ertworldresult[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = ertworldidname finishtab[-1]['result'] = ertworldresult finishtab[-1]['lang'] = 'gr' finishtab[-1]['name'] = ertworldchannelname finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['dst'] = 0 finishtab[-1]['encoding'] = 'windows-1253' else: if verbose > 0: print 'Ignoring ERT World' if grab_channel['duna']: if verbose > 0: print 'Processing Duna' #Generate URL tz = pytz.timezone('Europe/Budapest') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset-1) #Read web site parser = Scraper(stateduna,completeclass=True) #TRT only has 7 days worth of data from today days = min(7-offset, maxdays+1) counter = 1 for i in range(days): url = '%s?nap=%04d-%02d-%02d&channel=4284' % (dunaurl, date.year, date.month, date.day) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br|p|b') date += datetime.timedelta(days=1) result = parser.processed parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from Duna" #raise Error("Couldn't extract any content from Duna",404) else: result[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = dunaidname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'hu' finishtab[-1]['name'] = dunachannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['encoding'] = 'utf-8' finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring Duna' if grab_channel['tvri']: if verbose > 0: print 'Processing TVRI' tz = pytz.timezone('Asia/Jakarta') date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) #Only today's data parser = Scraper(statetvri) url = tvriurl 
if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') result = parser.processed parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from TVRI" #raise Error("Couldn't extract any content from VTV4",414) else: result[:0] = ['date','%02d/%02d/%04d' % (date.day,date.month,date.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = tvriidname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'id' finishtab[-1]['name'] = tvrichannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['encoding'] = 'utf-8' finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring TVRI' if grab_channel['rtm']: if verbose > 0: print 'Processing Malaysian TV' #Calculate timezone in Kuala Lumpur tz = pytz.timezone('Asia/Kuala_Lumpur') date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset-1) #Read 8 days maximum worth of programme days = min(8-offset, maxdays+1) parser = Scraper(statertm) for i in range(days): url = '%s?date=%04d-%02d-%02d' % ( rtmurl , date.year, date.month, date.day) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') date += datetime.timedelta(days=1) rtmresult = parser.processed parser.close() if not rtmresult: print >> sys.stderr, "Couldn't extract any content from NewsAsia" #raise Error("Couldn't extract any content from Malaysian TV",409) else: finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = rtmidname finishtab[-1]['result'] = rtmresult finishtab[-1]['lang'] = 'my' finishtab[-1]['name'] = rtmchannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring Malaysian TV' if grab_channel['rt']: if 
verbose > 0: print 'Processing Russia Today' tz = pytz.timezone('UTC') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) #Read 7 days maximum worth of programme, put maxdays+1 so we can get info about the last program of the day days = min(7-offset, maxdays+1) parser = Scraper(staterussia) for i in range(days): url = '%s/%02d-%02d-%02d?tz=0' % (russiaurl, date.year, date.month, date.day) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data) date += datetime.timedelta(days=1) russiaresult = xmltv.scangenre(self.time_title(parser.processed)) parser.close() if not russiaresult: print >> sys.stderr, "Couldn't extract any content from Russia Today" #raise Error("Couldn't extract any content from Russia Today",407) else: russiaresult[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = russiaidname finishtab[-1]['result'] = russiaresult finishtab[-1]['lang'] = 'ru' finishtab[-1]['name'] = russiachannelname finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring Russia Today' if grab_channel['sctv']: if verbose > 0: print 'Processing SCTV' #Program on SCTV starts on Monday, so calculate the date of last Monday tz = pytz.timezone('Australia/Sydney') date = datetime.datetime.now(tz=tz) - datetime.timedelta(days=datetime.datetime.now(tz=tz).weekday()) #SCTV has shocking html, so purely extract all possible text and run regular expression to extract the time and title parser = Scraper(statesctv) try: handle = Web(sctvurl) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,emptyfield=False) sctvresult = parser.processed parser.close() textextract = '' for i in range (len(sctvresult)/2): if 
sctvresult[i*2] == 'title': textextract += sctvresult[i*2+1] + '\n' p = re.compile(r'([0-9]{2}:[0-9]{2}[ap]m)\s+-\s+(.*)\s+([0-9]{2}:[0-9]{2}[ap]m\s+-\s+[\s\S]*)') sctvresult2 = [] finished = False while not finished: m = p.search(textextract) if m: sctvresult2.append('time') sctvresult2.append(m.group(1)) sctvresult2.append('title') sctvresult2.append(m.group(2)) textextract = m.group(3) else: finished = True #Retrieved last entry p = re.compile(r'([0-9]{2}:[0-9]{2}[ap]m)\s+-\s+(.*)') m = p.search(textextract) if m: sctvresult2.append('time') sctvresult2.append(m.group(1)) sctvresult2.append('title') sctvresult2.append(m.group(2)) if not sctvresult2: print >> sys.stderr, "Couldn't extract any content from SCTV" #raise Error("Couldn't extract any content from SCTV",405) else: sctvresult2[:0] = ['date','%02d/%02d/%04d' % (date.day,date.month,date.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = sctvidname finishtab[-1]['result'] = sctvresult2 finishtab[-1]['lang'] = 'en' finishtab[-1]['name'] = sctvchannelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring SCTV' if grab_channel['tv5']: #Process TV5 data if verbose > 0: print 'Processing TV5' #Set cookie for TV5 cj = cookielib.CookieJar() for i in cookietv5: if conf.config.has_key(cookietv5[i]): cookietv5[i] = `time_zone[conf.config[cookietv5[i]]][2]` for i in cookietv5: cookie = cookielib.Cookie(0,i,cookietv5[i],None,False,tv5mainurl,True,True,'/', True, False, maxtv5cookie, False, None, None, {} ) cj.set_cookie(cookie) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) try: handle = Web(tv5url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() if verbose > 0: print 'Done grabbing data.. 
processing' tz = pytz.timezone('Australia/Melbourne') date = datetime.datetime.now(tz=tz) parser = Scraper(statetv5) parser.feed(data,ignoretag='br') if not parser.processed: print >> sys.stderr, "Couldn't extract any content from TV5" #raise Error("Couldn't extract any content from TV5",400) else: #Add it to the processed table finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = tv5idname finishtab[-1]['result'] = parser.processed finishtab[-1]['lang'] = 'fr' finishtab[-1]['name'] = tv5channelname finishtab[-1]['timezone'] = calc_timezone(date) finishtab[-1]['dst'] = dst parser.close() else: if verbose > 0: print 'Ignoring TV5' if grab_channel['voa']: if verbose > 0: print 'Processing Voice of America' tz = pytz.timezone('UTC') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) #Create parser instance parser = Scraper(statevoa) #Read 8 days maximum worth of programme days = min(8, maxdays+1) for i in range(days): data = { 'requestdate' : '%02d%02d%02d' % (date.month, date.day, date.year - 2000), 'satellite' : 'AS', 'type' : 'full'} try: handle = Web(voaurl, data=data) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() #Feed the data to the parser parser.feed(data,emptyfield=False,ignoretag='br') date += datetime.timedelta(days=1) voaresult = parser.processed parser.close() if not voaresult: print >> sys.stderr, "Couldn't extract any content from Voice of America" #raise Error("Couldn't extract any content from Voice of America",410) else: voaresult2 = [] status = 0 for i in range(len(voaresult)/2): x = voaresult[i*2] y = voaresult[i*2+1] if i*2+3 < len(voaresult): x2 = voaresult[i*2+2] y2 = voaresult[i*2+3] if i*2+5 < len(voaresult): x3 = voaresult[i*2+4] y3 = voaresult[i*2+5] else: x3 = y3 = '' if verbose > 1: print 'x',x,'y',y,'x2',x2,'y2',y2 if x == 'data' and x2 == 'data2': if y == 'Time:': voaresult2.append('time') 
voaresult2.append(re.compile(r'\s*([0-9]{2}:[0-9]{2})').match(y2).group(1)) status = 0 elif y == 'Title:': voaresult2.append('title') voaresult2.append(y2) elif y == 'Episode:': voaresult2.append('subtitle') voaresult2.append(y2) elif y == 'Language:': status = 1 lang = y2 elif y == 'Description:': voaresult2.append('desc') if status == 1: voaresult2.append(y2 + '\nLanguage: ' + lang) else: voaresult2.append(y2) elif x == 'title' and x2 == 'title2' and x3 == 'title3': voaresult2.append('title') voaresult2.append(y2+y3) elif x == 'title' and x2 == 'title2': voaresult2.append('title') voaresult2.append(y2) voaresult2[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = voaidname finishtab[-1]['result'] = voaresult2 finishtab[-1]['lang'] = 'en' finishtab[-1]['name'] = voachannelname finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['dst'] = 0 finishtab[-1]['encoding'] = 'utf-8' else: if verbose > 0: print 'Ignoring Voice of America' if grab_channel['thaitv5']: if verbose > 0: print 'Processing Thai TV5' tz = pytz.timezone('Asia/Bangkok') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) #Read 7 days maximum worth of programme, put maxdays+1 so we can get info about the last program of the day days = min(7, maxdays+1) parser = Scraper(statethaitv5, completeclass=True) for i in range(days): url = '%s?date=%04d%02d%02d' % (thaitv5url, date.year, date.month, date.day) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') date += datetime.timedelta(days=1) result = self.time_title(parser.processed) parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from Thai TV5" #raise Error("Couldn't extract any content from Thai TV5",413) else: result[:0] = 
['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = thaitv5idname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'th' finishtab[-1]['name'] = thaitv5channelname finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['encoding'] = 'tis-620' finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring Thai TV5' if grab_channel['vtv4']: if verbose > 0: print 'Processing VTV4' tz = pytz.timezone('Etc/GMT-7') date1 = date = datetime.datetime.now(tz=tz) + datetime.timedelta(days=offset) #Read 6 days maximum worth of programme, put maxdays+1 so we can get info about the last program of the day days = min(6, maxdays+1) parser = Scraper(statevtv4, completeclass=True) for i in range(days): url = '%s/%04d/%d/%d' % (vtv4url, date.year, date.month, date.day) if verbose > 0: print 'Reading url=%s' % url try: handle = Web(url) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() parser.feed(data,ignoretag='br') date += datetime.timedelta(days=1) result = self.time_title(parser.processed) parser.close() if not result: print >> sys.stderr, "Couldn't extract any content from VTV4" #raise Error("Couldn't extract any content from VTV4",414) else: result[:0] = ['date','%02d/%02d/%04d' % (date1.day,date1.month,date1.year)] finishtab.append(copy.deepcopy(finishtemplate)) finishtab[-1]['id'] = vtv4idname finishtab[-1]['result'] = result finishtab[-1]['lang'] = 'vn' finishtab[-1]['name'] = vtv4channelname finishtab[-1]['timezone'] = calc_timezone(date1) finishtab[-1]['encoding'] = 'utf-8' finishtab[-1]['dst'] = 0 else: if verbose > 0: print 'Ignoring VTV4' #Read Nasa XMLTV from TPG docnasa = False if grab_channel['tpgnasa']: if verbose > 0: print 'Processing TPG NASA channel' tz = pytz.timezone('UTC') try: handle = Web(tpgnasaurl) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() 
docnasa = xmltv.filterxmltv(data,0,offset=offset, days=maxdays, removepattern = [r'(\S+?)\.tpg\.com\.au', r'tpg.\1']) #Read XMLTV from TPG doctpg = False if grab_channel['tpg']: if verbose > 0: print 'Processing TPG channels' tz = pytz.timezone('UTC') try: handle = Web(tpgurl) except Error, e: print >> sys.stderr, 'Error:', e.code, e.message else: data = handle.read() handle.close() doctpg = xmltv.filterxmltv(data,0,offset=offset, days=maxdays, removepattern = [r'(\S+?)\.tpg\.com\.au', r'tpg.\1']) #Generate XML for x in finishtab: xmltv.addchannel(x['name'], idprefix + x['id'], lang=x['lang']) if docnasa: xmltv.mergechannels(docnasa) if doctpg: xmltv.mergechannels(doctpg) for x in finishtab: if verbose > 0: print 'addqueue of', idprefix + x['id'], offset, maxdays, x['dst'], x['timezone'], x['marker'], x['encoding'] xmltv.addqueueprogramme(x['result'], idprefix + x['id'], offset=offset, days=maxdays, dst=x['dst'], timezone=x['timezone'], marker=x['marker'], encoding=x['encoding']) if docnasa: xmltv.mergeprogrammes(docnasa) if doctpg: xmltv.mergeprogrammes(doctpg) # Print/Write our newly created XML if outfilename: output2 = open(outfilename,'w') if converttz: doc = xmltv.adjusttimezone(xmltv.doc,timezone) if pretty: output2.write(doc.toprettyxml(indent=" ", encoding="utf-8")) else: output2.write(doc.toxml(encoding="utf-8")) else: if pretty: output2.write(xmltv.toprettyxml()) else: output2.write(xmltv.toxml()) if outfilename: output2.close() return xmltv if __name__ == '__main__': try: opts, args = getopt.getopt(sys.argv[1:], \ "hvo:d:c:", ["help", "version", "quiet", "verbose", "days=", "pretty", "output=", "configure", "capabilities", "offset=", "config-file=", "preferredmethod", "description"]) except getopt.GetoptError: # print help information and exit: print "Unrecognised option: " usage() sys.exit(2) output = sys.stdout pretty = False inconfigure = 0 conf = None for o, a in opts: if o in ("-h", "--help"): usage() sys.exit(0) elif o == "--version": print >> 
sys.stderr, NAME, VERSION sys.exit(0) elif o == "--capabilities": print CAPABILITIES sys.exit(0) elif o == "--pretty": pretty = True elif o == "--preferredmethod": print PREFERREDMETHOD sys.exit(0) elif o == "--description": print DESCRIPTION sys.exit(0) elif o == "--quiet": verbose -= 50 elif o == "--verbose": verbose += 1 elif o in ("-d", "--days"): maxdays = int(a) if maxdays < 1: print >> sys.stderr, "invalid number of days" sys.exit(2) elif o == "--offset": day_offset = int(a) elif o in ("-o", "--output"): output = open(a,'w') elif o == "--configure": inconfigure = 1 elif o in ("-c", "--config-file"): config_file = a conf = Config(config_file, inconfigure) if inconfigure: conf.initconfig() conf.write() sys.exit(0) if conf.exists == False: print >> sys.stderr, "Config file missing run with --configure" sys.exit(2) #Read config localtimezone = time_zone[conf.config['TimeZone']][0] dst = time_zone[conf.config['TimeZone']][1] tv_grab=TVGRAB() try: tv_grab.main(conf,output=output, pretty=pretty,converttz=True,timezone=localtimezone,maxdays=maxdays,offset=day_offset) except Error, e: print >> sys.stderr, e.message print >> sys.stderr, 'Error:', e.code sys.exit(2) if output != sys.stdout: output.close() ################################################################# # History # 24/11/2008: version 0.4.6 # 24/11/2008: Updated TRT International change of URL and format # 08/11/2008: version 0.4.5 # 08/11/2008: Work around SCTV not using valid times # 05/11/2008: version 0.4.4 # 05/11/2008: Remove DST calculations for the time being # 05/10/2008: version 0.4.3 # 05/10/2008: Added NASA TV # 11/06/2008: version 0.4.2 # 11/06/2008: Updated World of Fashion # 11/05/2008: version 0.4.0 # 11/05/2008: Updated grabber for DW, Duna, BVN, TRT International. 
#             Grabbing from original web site instead
# 07/05/2008: version 0.3.1
# 07/05/2008: Changed User Agent String to look like we are using IE7 on Windows XP
# 05/05/2008: version 0.3.0
# 05/05/2008: Add CCTV channels, clean up html for TVE
# 01/05/2008: version 0.2.0
# 01/05/2008: Add TVRI, World of Fashion. Re-order channels to be similar to TPG's listing
# 01/05/2008: Add Thai TV5, VTV4
# 30/04/2008: version 0.1.7
# 30/04/2008: Add Eurosport, Eurosport News and Bloomberg
# 29/04/2008: version 0.1.6
# 29/04/2008: Convert date/time to local timezone
# 26/04/2008: version 0.1.5
# 26/04/2008: TV5 updated, add movie classic, playboy tv and adult one
# 19/04/2008: version 0.1.4
# 19/04/2008: Do not stop if a channel can't be processed
# 18/04/2008: version 0.1.3
# 18/04/2008: Added Voice of America
# 16/04/2008: version 0.1.2
# 16/04/2008: Add conversion of html escape code to utf-8. Added support for encodings other than iso8859-1. Added Greek TV and Malaysian TV
# 15/04/2008: version 0.1.1
# 15/04/2008: Add automatic category
# 15/04/2008: version 0.1.0
# 15/04/2008: Added another work around for TVE, sometimes they have midnight showing like 24:00:00
# 13/04/2008: version 0.0.9
# 13/04/2008: Added SCTV, AsiaNews, Russia Today
# 12/04/2008: version 0.0.8
# 12/04/2008: Added Cuba Vision and SCTV
# 11/04/2008: version 0.0.7
# 10/04/2008: Added cctv4, bvn, duna, trtint
# 09/04/2008: version 0.0.6
# 09/04/2008: Add support for lyngsat. Added TVE
# 08/04/2008: version 0.0.5
# 08/04/2008: Add disclaimer notice. Add TVE
# 03/04/2008: version 0.0.4
# 03/04/2008: Rewrote xml generator. Add Al Jazeera
# 02/04/2008: Use python datetime module to perform time calculation
# 01/04/2008: version 0.0.3
# 31/03/2008: Rewrite of the scraper engine to use an automated state table. Will ease the addition of other channels
# 28/03/2008: add handling of DST and support of timezone
# 26/03/2008: 0.0.1 : First version