#!/usr/bin/env python ## tv_grab_ar.py ## Copyright (C) 2009 Mauro A. Meloni ## http://bitnegro.blogspot.com/ ## ## This script is based on the work done by grunjol, Lemac and others. ## The original tv_grab_ar is available at ## http://www.argenteam.net/soft/tv_grab_ar ## ## This program is free software: you can redistribute it and/or modify ## it under the terms of the GNU General Public License as published by ## the Free Software Foundation, either version 3 of the License, or ## (at your option) any later version. ## ## This program is distributed in the hope that it will be useful, ## but WITHOUT ANY WARRANTY; without even the implied warranty of ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ## GNU General Public License for more details. ## ## You should have received a copy of the GNU General Public License ## along with this program. If not, see . ## ## ## Version history: ## ## 2009.09.21-1 Fix para multicanal, que tiene mal el nro de TCM ## 2009.09.17-3 Despliegue de genero como subtitulo ## -2 Fix para programas emitidos en dias sucesivos ## 2009.09.17-1 Correccion de stationlist.xml ## 2009.09.16-6 Xmltv Writer ad hoc ## 5 Descarga de canales ordenados por id ## 4 Cache de fichas ## 3 Fixes a temas de encoding ## 2 Recuperacion de descripciones ## 2009.09.16-1 Version inicial ## from BeautifulSoup import BeautifulSoup, BeautifulStoneSoup import codecs import cookielib from copy import copy from datetime import datetime, date, time, timedelta import optparse from os.path import expanduser, join import pickle import re import sys import unicodedata import urllib2 VERSION = '2009.09.21-1' LANG = u'es' DATETIME_TZINFO = ' -0300' DATETIME_FMT = '%Y%m%d%H%M' + DATETIME_TZINFO INVALID_CHARS = r'[^A-Za-z0-9.,;\-\s]' TVTIME_CONFIG_DIR = expanduser('~/.tvtime') XMLTV_CONFIG_DIR = expanduser('~/.xmltv') def unescape (text): return text def parse_style (styledef): style = {} items = styledef.split(';') map(''.strip, items) for item in items: try: (tag, value) = item.split(':') except ValueError: continue style[tag.strip()] = value.strip() return style def remove_html_tags (data): return re.sub(r'<.*?>', '', data) def remove_extra_spaces (data): return re.sub(r'\s+', ' ', data) def remove_letters (data): return re.sub(r'[^-0-9]', '', data) def sec_to_hour (seconds): return '%02d:%02d' % (seconds / 3600, (seconds / 60) % 60) def put_entities (text): text = text.replace('&', '***PLACEHOLDER***') text = text.replace('&', '&') text = text.replace('***PLACEHOLDER***', '&') return text class Writer: def __init__ (self, encoding, source_info_url, source_info_name, generator_info_name, generator_info_url): self.encoding = encoding self.source_info_url = source_info_url self.source_info_name = source_info_name self.generator_info_name = generator_info_name self.generator_info_url = generator_info_url self.channels = [] self.programs = [] def addChannel (self, d): self.channels.append(d) def addProgramme (self, d): self.programs.append(d) def xml_start (self): template = u''' ''' return template % ( self.encoding, self.generator_info_name, self.generator_info_url, self.source_info_name, self.source_info_url, ) def xml_end (self): return u'\n' def channel_to_xml (self, d): output = u' \n' % d['id'] for (text, lang) in d['display-name']: output += u' %s\n' % (lang, text) if d.has_key('icon'): for item in d['icon']: output += u'' % item output += u' \n' return output def program_to_xml (self, d): output = u' \n' % (d['channel'], d['start'], d['stop']) for (text, lang) in d['title']: output += u' %s\n' % (lang, text) if d.has_key('desc'): for (text, lang) in d['desc']: output += u' %s\n' % (lang, text.decode('utf-8')) if d.has_key('category'): # hack for tvtime itemlang = u'' itemtext = u'' for (text, lang) in d['category']: # output += u' %s\n' % (lang, text) itemlang = lang itemtext += text + ', ' if itemtext: output += u' Genero: %s\n' % (itemlang, itemtext[:-2]) output += u' \n' return output def write (self, fh): fh.write(self.xml_start()) for d in self.channels: fh.write(put_entities(self.channel_to_xml(d))) for d in self.programs: fh.write(put_entities(self.program_to_xml(d))) fh.write(self.xml_end()) class TvGrabAr: def __init__ (self, provider = 'MULTICANAL'): self.fichasdb = join(TVTIME_CONFIG_DIR, 'fichas.db') self.options = None self.base_url = None if provider == 'MULTICANAL': self.base_url = 'http://www.buscadormulticanal.com.ar' self.fichas = {} # get first cookie cj = cookielib.CookieJar() self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj)) self.opener.open(self.base_url) def retrieve_channels (self): channels = {} url = '/index.php?template=main_grilla_semanal.tpl' if self.options.verbose: print 'Retrieving %s ... ' % url tree = BeautifulSoup(self.opener.open(self.base_url + url)) select = tree.find('select', attrs={'name': 'canal'}) if select: opchannels = sorted(select.findAll('option')) for opt in opchannels: if not opt['value']: continue (num, name) = opt.string.split('-', 1) # fix para los muchachos de multicanal, que ponen en la web # ponen a TCM en el 37 cuando en TV esta en el 38. try: if int(num) == 37 and name.strip().startswith('TCM'): num = 38 except ValueError: pass channel = XmltvChannel(opt['value'], num, name.strip()) channels[channel.id] = channel if self.options.verbose: print 'Found %d channels online.' % len(channels) return channels def retrieve_descriptions (self, ficha_id): url = '/index.php?verFicha=%d' % ficha_id if not self.fichas.has_key(ficha_id): if self.options.verbose: print 'Retrieving %s ...' % url tree = BeautifulSoup(self.opener.open(self.base_url + url)) tables = tree.findAll('table') if len(tables) == 0: return "", {} sinopsis = tables[0].renderContents() desc = remove_extra_spaces(remove_html_tags(sinopsis)).strip() propiedades = {} propiedad = tables[1].find('tr') while propiedad: mo = re.search(r'class="itemf1".*>(.*)<\/td>.*class="descf1".*>(.*)<\/td>', str(propiedad), re.I + re.S) if mo: propiedades[mo.group(1).strip()] = mo.group(2).strip() propiedad = propiedad.findNextSibling('tr') self.fichas[ficha_id] = (desc, propiedades) else: if self.options.verbose: print 'Skipping %s ...' % url (desc, propiedades) = self.fichas[ficha_id] return desc, propiedades def retrieve_programs (self, channel, day): startDay = datetime.combine(date.today(), time.min) startDay -= timedelta(startDay.weekday()) try: url = '/index.php?template=fgrilla_semanal.tpl&canal=%d&semana=%d' % (channel.id, 0) except ValueError: print channel.id print channel if self.options.verbose: print 'Channel %s' % channel print 'Retrieving %s ...' % url tree = BeautifulSoup(self.opener.open(self.base_url + url)) divs = tree.findAll('div', attrs={'class': 'layerPrograma'}) programs = [] for div in divs: try: title = div['title'] except KeyError: continue style = parse_style(div['style']) prog = XmltvProgram() prog.channel = channel.id prog.title = title try: height = float(remove_letters(style['height'])) top = float(remove_letters(style['top'])) except ValueError: continue prog.duration = height / 64 * 3600 prog.starthour = top / 64 * 3600 broadcasts = int(round(float(remove_letters(style['width'])) / 100, 0)) for d in range(broadcasts): programStartDay = (int(float(remove_letters(style['left'])) - 47) / 98) + d programStartDay = startDay + timedelta(programStartDay) programStartHour = datetime.strptime(sec_to_hour(prog.starthour), '%H:%M') if sec_to_hour(prog.duration) != '24:00': programDuration = datetime.strptime(sec_to_hour(prog.duration), '%H:%M') else: programDuration = datetime.strptime('23:59', '%H:%M') prog.start = datetime.combine(programStartDay.date(), programStartHour.time()) prog.stop = prog.start + timedelta(hours=programDuration.hour, minutes=programDuration.minute) if not self.options.skip_descriptions: mo = re.search(r'index\.php\?verFicha.(\d+)\&', str(div), re.I) if mo: (prog.description, prog.data) = self.retrieve_descriptions(int(mo.group(1))) programs.append(copy(prog)) if self.options.verbose: print 'Found %d programs.' % len(programs) return programs def configure (self): channels = self.retrieve_channels() chanlist = channels.values() chanlist = sorted(chanlist, cmp=lambda x,y: x.id - y.id) add_all = False skip_all = False for channel in chanlist: prompt = 'add channel %s [yes, no, all, none] ? ' % str(channel) if not add_all and not skip_all: reply = None while reply not in ['y', 'yes', 'n', 'no', 'all', 'none', '']: reply = raw_input(prompt).strip().lower() if reply == '' or reply == 'y' or reply == 'yes': channel.enabled = True elif reply == 'n' or reply == 'no': channel.enabled = False elif reply == 'all': add_all = True elif reply == 'none': skip_all = True elif add_all: channel.enabled = True print prompt + 'yes' elif skip_all: channel.enabled = False print prompt + 'no' conf = codecs.open(self.options.config_file, 'w', 'UTF-8') for channel in chanlist: line = 'channel %d %s\n' % (channel.id, channel.name) if channel.enabled: conf.write(line) else: conf.write('#' + line) conf.close() print 'Finished configuration.' def set_enabled_channels (self, channels): for id in channels: channels[id].enabled = False enabled = 0 for line in open(self.options.config_file): (chan, id, name) = line.split(' ', 2) if chan != 'channel': continue enabled += 1 if int(id) in channels: channels[int(id)].enabled = True if self.options.verbose: print 'Found %d channels enabled.' % enabled def fix_stationlist (self, channels): if self.options.verbose: print 'Fixing %s names ... ' % self.options.station_file, fh = open(self.options.station_file, 'r') tree = BeautifulSoup(fh) fh.close() stations = tree.findAll('station') if not stations: if self.options.verbose: print 'not a stationlist.xml file.' return for station in stations: if not station['channel'].isdigit(): continue number = int(station['channel']) item = [chan for chan in channels if chan.number == number] if item: station['name'] = item[0].name try: fh = open(self.options.station_file, 'w') fh.write(put_entities(tree.prettify())) fh.close() if self.options.verbose: print 'done.' except IOError: if self.options.verbose: print 'could not write %s.' % self.options.station_file def sort_programs (self, programs): return sorted(programs, cmp=lambda x,y: (x.start - y.start).days * 24 * 3600 + (x.start - y.start).seconds) def grab (self): if self.options.verbose: print 'Getting list of channels' channels = self.retrieve_channels() self.set_enabled_channels(channels) channels = channels.values() channels = sorted(channels, cmp=lambda x,y: x.id - y.id) if self.options.fix_stationlist: self.fix_stationlist(channels) if self.options.verbose: print 'Reading program card cache ... ', try: self.fichas = pickle.load(open(self.fichasdb, 'r')) if self.options.verbose: print '%d programs known.' % len(self.fichas) except IOError: if self.options.verbose: print 'could not load %s' % self.fichasdb if not self.options.list_channels: programs = [] for channel in channels: if not channel.enabled: continue programs += self.retrieve_programs(channel, date.today()) programs = self.sort_programs(programs) if self.options.verbose: print 'Saving program card cache ... ', try: pickle.dump(self.fichas, open(self.fichasdb, 'w')) if self.options.verbose: print 'done.' except IOError: if self.options.verbose: print 'could not write %s' % self.fichasdb xml = Writer( \ encoding='UTF-8', source_info_url='http://www.buscadormulticanal.com.ar/', source_info_name='http://www.buscadormulticanal.com.ar/', generator_info_name='tv_grab_ar.py ' + VERSION, generator_info_url='http://bitnegro.blogspot.com/' ) for channel in channels: if channel.enabled: xml.addChannel(channel.get_dict()) if not self.options.list_channels: for program in programs: xml.addProgramme(program.get_dict()) if options.output: fh = codecs.open(options.output, 'w', 'UTF-8') else: fh = sys.stdout xml.write(fh) class XmltvChannel: def __init__ (self, id, number, name): self.id = int(id) self.number = int(number) self.name = name self.icon = None self.url = None self.enabled = True def get_dict (self): d = {} d['id'] = unicode(self.id) d['display-name'] = [ (unicode(self.number), LANG), (unicode(unescape(self.name)), LANG) ] if self.icon: d['icon'] = [unicode(self.icon)] if self.url: d['url'] = unicode(self.url) return d def __str__ (self): return '%02d - %s (id %d)' % (self.number, self.name.encode('utf-8'), self.id) class XmltvProgram: def __init__ (self): self.channel = None self.start = None self.stop = None self.title = '' self.description = '' self.duration = None self.starthour = None self.data = None def get_dict (self): d = {} d['channel'] = unicode(self.channel) d['title'] = [(unicode(unescape(self.title)), LANG)] if self.description: d['desc'] = [(self.description, LANG)] d['start'] = unicode(self.start.strftime(DATETIME_FMT)) d['stop'] = unicode(self.stop.strftime(DATETIME_FMT)) if self.data: key = 'G\xc3\xa9nero' if self.data.has_key(key): cats = [] for cat in self.data[key].split(','): cats.append((unicode(unescape(cat.decode('utf-8').strip())), LANG)) d['category'] = cats key = 'País' if self.data.has_key(key): d['country'] = [(self.data[key], LANG)] key = 'Año' if self.data.has_key(key): d['date'] = [unicode(unescape(self.data[key]))] #audio #credits #episode-num #language #last-chance #length #new #orig-language #premiere #previously-shown #rating #star-rating #sub-title #subtitles #video return d def __str__ (self): retval = '' if self.channel is not None: retval = 'Channel: \t%s\n' % self.channel retval += 'Title: \t%s\n' % self.title.encode('utf-8') retval += 'Description: \t%s\n' % self.description if self.start is not None: retval += 'Start: \t%s\n' % self.start if self.stop is not None: retval += 'Stop: \t%s\n' % self.stop if self.duration is not None: retval += 'Duration: \t%s\n' % sec_to_hour(self.duration) if self.starthour is not None: retval += 'Start Hour: \t%s\n' % sec_to_hour(self.starthour) retval += 'Data:\n' if self.data: for item in self.data: retval += '\t%s: %s\n' % (item, self.data[item]) return retval if __name__ == '__main__': parser = optparse.OptionParser( version='%prog ' + VERSION, description='Get Argentinian television listings in XMLTV format' ) parser.add_option('--days', type='int', dest='days', default=3, metavar='N', help='Grab N days. The default is 3. [not implemented]') parser.add_option('--offset', type='int', dest='offset', default=0, metavar='N', help='Start N days in the future. The default is to start from today. [not implemented]') parser.add_option('--skip-descriptions', action='store_true', dest='skip_descriptions', default=False, help='Do not download program descriptions.') parser.add_option('--output', dest='output', metavar='FILE', help='Write to FILE rather than standard output.') parser.add_option('--fix-stationlist', action='store_true', dest='fix_stationlist', help='Fill the channel names into the station list file.') parser.add_option('--station-file', dest='station_file', metavar='FILE', default=join(TVTIME_CONFIG_DIR, 'stationlist.xml'), help='Set the name of the station list file, the default is <' + join(TVTIME_CONFIG_DIR, 'stationlist.xml') + '>') parser.add_option('--configure', action='store_true', dest='configure', help='Prompt for which channels and write the configuration file.') parser.add_option('--config-file', dest='config_file', metavar='FILE', default=join(XMLTV_CONFIG_DIR, 'tv_grab_ar.conf'), help='Set the name of the configuration file, the default is <' + join(XMLTV_CONFIG_DIR, 'tv_grab_ar.conf') + '>. This is the file written by --configure and read when grabbing.') parser.add_option('--quiet', action='store_true', dest='quiet', default=False, help='Suppress the progress messages normally written to standard error.') parser.add_option('--verbose', action='store_true', dest='verbose', default=False, help='Display additional information.') parser.add_option('--list-channels', action='store_true', dest='list_channels', help='Display only the channel listing.') (options, args) = parser.parse_args() if options.days < 0: parser.error('number of days must not be negative') app = TvGrabAr() app.options = options if options.verbose: print 'tv_grab_ar.py %s\n' % VERSION if options.configure: app.configure() else: app.grab()