# -*- coding: iso-8859-15 -*-
# Needed for German umlauts

##################################################################
#
# ARD EPG pages scraper
# Output is XMLTV format (see http://wiki.xmltv.org/index.php/XMLTVFormat)
# Data source: https://programm.ard.de/
# For details refer to: https://projects.webvoss.de/2019/04/14/legal-epg-scraper-for-ard-tv-stations-to-use-with-tvheadend-external-xmltv-grabber/
#
# Usage: python3 GrabARD.py
# (python 2 may be used - watch for code adjustment requirements in the comments below)
#
# Version 4 by Hauke April 18th, 2019 (first stable version)
# Version 5 by Hauke April 22nd, 2019 (Category-Keyword assignments in separate file & more detailed)
#
# !!!    Private use only    !!!
# !!! Do not publish results !!!
# Everything else violates ARD policy!
#
###################################################################

import urllib.request as Webclient            # Run web requests
from bs4 import BeautifulSoup as WebScraper   # Framework to scrape data from HTML responses (apt-get install python3-bs4)
from lxml import etree as XMLtree             # To create XML file for XMLtv (apt-get install python3-lxml)
                                              # If you miss lxml, use instead:
                                              #   from xml.etree import ElementTree as XMLtree
                                              # and change all WebScraper-calls in the code from parser "lxml" to "html.parser" - and suffer a factor 2-4 performance penalty :-(
                                              # Also, XML output routine needs adjustment at the end of the code, and '<br>'/'<br/>' needs change at one place
import time                                   # Handle dates and times with timezone offsets
import os                                     # To set timezone and check for files


#####################
### Configuration ###
#####################

# File with keyword-to-category assignments.
# Format (parsed below): a line ending in ":" starts a category, following
# lines are its keywords, "#" lines are comments, blank lines are ignored.
CategoryAssignmentsFileName = "CategoryAssignments.txt"

# Where to store keywords that could not be matched to any category.
# Read back on the next run, so the list accumulates across runs.
UnknownKeywordsFileName = "UnknownKeywords.txt"

# Name of the XMLTV output file
XMLtvFileName = "ARD.xml"

# Number of days to scrape from the EPG (one extra day before today is
# grabbed as well - see the grabbing loop)
DaysToGrab = 14

# Debug mode? If True, progress (stations, days, shows) is printed to stdout
DebugMode = False

### End Configuration ###

# Read in the keyword-to-category assignments (if the file exists).
# File format: a line ending in ":" opens a new category; all following
# non-empty, non-comment lines are keywords belonging to that category.
# Result: CategoryAssignments maps category name -> list of keywords.
CategoryAssignments = {}
if os.path.exists(CategoryAssignmentsFileName):
  with open(CategoryAssignmentsFileName, "r") as AssignmentsFile:
    Assignments = AssignmentsFile.read().splitlines()
  # Note: the 'with' block closes the file - no explicit close() needed
  CurrentCategory = ""
  CurrentKeywords = []
  for Line in Assignments:
    Assignment = Line.strip()
    if Assignment == "" or Assignment[0] == "#":
      # Skip blank lines and comments
      continue
    if Assignment[-1] == ":":
      # A new category starts - store the keywords collected for the previous one
      if CurrentKeywords and CurrentCategory != "":
        CategoryAssignments[CurrentCategory] = CurrentKeywords
      CurrentCategory = Assignment[0:-1].strip()
      CurrentKeywords = []
    else:
      CurrentKeywords.append(Assignment)
  # Don't forget the assignments of the last category in the file
  if CurrentKeywords and CurrentCategory != "":
    CategoryAssignments[CurrentCategory] = CurrentKeywords
else:
  print ("Warning: No category assignments found - categories disabled")

# Reference timestamp for "today"; all EPG day offsets are computed from this.
Today = time.time()
# Force German local time regardless of host timezone (obviously correct for
# German TV). Do not use CET or CEST - Europe/Berlin dynamically selects the
# correct offset including daylight savings transitions.
os.environ['TZ'] = 'Europe/Berlin'
if hasattr(time, 'tzset'):
  # Make the TZ change effective for time.localtime()/time.mktime() right away.
  # Without tzset() the new TZ value is not guaranteed to be picked up.
  # (tzset is Unix-only, hence the hasattr guard.)
  time.tzset()

# Root element of the XMLTV output document. The attributes identify this
# grabber and the ARD pages the data was scraped from (per the XMLTV format).
GeneratorInfo = {
    "generator-info-name": "GrabARD.py from https://projects.webvoss.de",
    "generator-info-url": "https://projects.webvoss.de/2019/04/14/legal-epg-scraper-for-ard-tv-stations-to-use-with-tvheadend-external-xmltv-grabber/",
    "source-info-url": "https://www.ard.de",
    "source-info-name": "ARD",
    "source-data-url": "https://programm.ard.de/",
}
XMLtv = XMLtree.Element('tv', attrib=GeneratorInfo)

# Base URL of the ARD EPG pages; all scraped links are relative to this
BaseURL = 'https://programm.ard.de'

# Fetch the EPG start page, which lists all stations together with their EPG IDs
ARDpageRequest = Webclient.urlopen(BaseURL + '/TV/Programm/Sender')
ARDstartPage = WebScraper(ARDpageRequest, 'lxml')

# The station IDs live in the <option> entries of the station selector
TVstations = ARDstartPage.find_all('option', 'senderselektor_option')

# Register one <channel> element per station in the XMLTV tree
for Station in TVstations:
  ChannelID = Station['value']
  Channel = XMLtree.SubElement (XMLtv, "channel", id=ChannelID)
  XMLtree.SubElement (Channel, "display-name").text = Station.text
  XMLtree.SubElement (Channel, "url").text = BaseURL + '/TV/Programm/Sender?sender=' + ChannelID

# Load the list of previously uncategorized keywords (if the file exists),
# so repeated runs accumulate unknown keywords instead of starting over.
if os.path.exists(UnknownKeywordsFileName):
  with open(UnknownKeywordsFileName, "r") as KeywordsFile:
    UnknownKeywordsList = KeywordsFile.read().splitlines()
  # The 'with' block already closed the file - no explicit close() needed
else:
  UnknownKeywordsList = []    # collect keywords that are not matched to a category

# Now scrape the EPG overview pages of each station and append one <programme>
# element per show to the XMLTV tree.
for TVstation in TVstations:
  StationID = TVstation['value']
  StationName = TVstation.text

  if DebugMode:
    print ("Station: " + StationName)

  # Get the requested days - start at yesterday (day - 1 below), since the
  # ARD EPG pages are offset by 5 hours: a day's page already covers part of
  # the previous evening.
  for day in range(DaysToGrab + 1):

    # Create base date for XML start and end times and for the daily URL
    QueryDate = Today + ((day - 1) * 86400)       # Start at day before
    ShowDate = time.strftime("%Y%m%d", time.localtime(QueryDate))
    OldShowDate = ShowDate                        # Needed for VPS times that lie before midnight

    if DebugMode:
      print ("Day: " + str(day) + " (" + ShowDate + ") for " + StationName)

    # Get the daily overview EPG page for the given station ID
    EPGrequest = Webclient.urlopen(BaseURL + '/TV/Programm/Sender?sender=' + StationID + '&datum=' + time.strftime("%d.%m.%Y", time.localtime(QueryDate)))

    # Search for all show detail pages linked from the overview
    EPGpage = WebScraper(EPGrequest, 'lxml')
    Links = EPGpage.find_all('a', 'sendungslink')

    # Cycle through all shows and get the detail pages for show information
    LastEndHour = "00"    # to detect midnight rollover between consecutive shows
    for Link in Links:

      # Get URL of show details page
      LinkTarget = Link.get('href')
      DetailWebResponse = Webclient.urlopen(BaseURL + LinkTarget)
      DetailsPage = WebScraper(DetailWebResponse, 'lxml')

      # Find start and end time in the 'small-detail' div; the raw HTML looks
      # like "...<br/>HH:MM - HH:MM Uhr..." (optionally followed by a VPS time)
      TimePart = str(DetailsPage.find('div', 'small-detail'))
      TimeInformation = TimePart.split("<br/>")[1].split("Uhr")[0].split(" - ")   # with html.parser "<br>"!

      StartTime = TimeInformation[0].split(":")
      # NOTE(review): hour comparisons here and below are string comparisons -
      # this only works if the pages always use zero-padded two-digit hours
      # ("05", not "5") - TODO confirm against the live pages
      if StartTime[0].strip() < LastEndHour:
        # Midnight rollover just before this show: advance the base date
        QueryDate += 86400
        ShowDate = time.strftime("%Y%m%d", time.localtime(QueryDate))

      EndTime = TimeInformation[1].split(":")
      LastEndHour = EndTime[0].strip()

      # Construct show start BEFORE checking for a midnight rollover during
      # the show (the start still needs the old date in that case)
      ShowStart = ShowDate + StartTime[0].strip() + StartTime[1].strip()

      if EndTime[0].strip() < StartTime[0].strip():
        # Midnight rollover during the show: the end time is on the next day
        QueryDate += 86400
        ShowDate = time.strftime("%Y%m%d", time.localtime(QueryDate))

      ShowEnd = ShowDate + EndTime[0].strip() + EndTime[1].strip()

      # Add timezone offset according to daylight savings time.
      # Day of time switch itself is no problem with ARD: They adjust EPG and program to this, so conversion works.
      # Change "time.localtime" to "time.gmtime" in the following two lines for python 2
      LocalStartTime = time.strftime('%Y%m%d%H%M00 %z', time.localtime(time.mktime(time.strptime(ShowStart, '%Y%m%d%H%M'))))
      LocalEndTime = time.strftime('%Y%m%d%H%M00 %z', time.localtime(time.mktime(time.strptime(ShowEnd, '%Y%m%d%H%M'))))
      Show = XMLtree.SubElement(XMLtv, "programme", channel=StationID, start=LocalStartTime, stop=LocalEndTime)

      # If a VPS time is present on the page, add it as a 'vps-start' attribute
      VPSinformation = TimePart.split("VPS")
      if len(VPSinformation) > 1:
        VPStime = VPSinformation[1].split("<")[0].strip().split(":")
        # String hour comparison again - same zero-padding assumption as above
        if VPStime[0].strip() > EndTime[0].strip():
          # VPS time before midnight: use the day's original base date
          VPSStart = OldShowDate + VPStime[0].strip() + VPStime[1].strip()
        else:
          # VPS time needs to roll over to the (already advanced) next day
          VPSStart = ShowDate + VPStime[0].strip() + VPStime[1].strip()
        # Change "time.localtime" to "time.gmtime" in the following line for python 2
        VPSStartTime = time.strftime('%Y%m%d%H%M00 %z', time.localtime(time.mktime(time.strptime(VPSStart, '%Y%m%d%H%M'))))
        Show.set('vps-start', VPSStartTime)

      # Add other information (title, subtitle, description, credits (if present), keywords, length, URL, video and audio description)
      # Some are in the meta tags (already well formatted as text - nice!), some need to be detected in the HTML code.
      Title = DetailsPage.find('meta', property='og:title')['content']
      XMLtree.SubElement(Show, "title", lang="de").text = Title

      if DebugMode:
        # print ("Show: " + Title.encode('utf-8'))      # python 2
        print ("Show: " + Title)                      # python 3

      # Subtitle is optional; text after "|" is discarded (layout noise)
      SubtitleTag = DetailsPage.find('span', 'subtitle')
      if SubtitleTag:
        XMLtree.SubElement(Show, "sub-title", lang="de").text = SubtitleTag.text.split("|")[0].strip()

      XMLtree.SubElement(Show, "desc", lang="de").text = DetailsPage.find('meta', attrs={"name":"description"})['content']

      # Credits table ('besetzung' = cast): one <tr> per person with a role
      # cell and an actor cell
      Credits = DetailsPage.find('table', 'besetzung')
      if Credits:
        CreditXML = XMLtree.SubElement(Show, "credits")
        CreditEntries = Credits.find_all('tr')
        Role = "actor"
        for Credit in CreditEntries:
          LastRole = Role   # sometimes role is empty, which implies its the same as line before
          Role = Credit.find('td', 'role').text.strip()
          if Role == "":
            Role = LastRole
          ActorPart = Credit.find('td', 'actor')
          Actor = ActorPart.find('a').text.strip()
          # Transform German role names to the international XMLTV credit tags;
          # anything unrecognized is treated as an actor with the role as text
          if Role == "Regie":
            XMLtree.SubElement(CreditXML, 'director').text = Actor
          elif Role in ["Buch", "Drehbuch"]:
            XMLtree.SubElement(CreditXML, 'writer').text = Actor
          elif Role == "Redaktion":
            XMLtree.SubElement(CreditXML, 'editor').text = Actor
          elif Role == "Moderation":
            XMLtree.SubElement(CreditXML, 'presenter').text = Actor
          elif Role in ["Musik", "Komposition"]:
            XMLtree.SubElement(CreditXML, 'composer').text = Actor
          else:
            XMLtree.SubElement(CreditXML, 'actor', role=Role).text = Actor

      # First keywords are repeat of title etc. - not useful. Useful keywords start after the name of the station in the keywords.
      # For some stations, the station name in the keywords does not match the "official" name --> First find out the "internal" station name
      InternalStationName = DetailsPage.find('title').text.split(" - ")[-1].split("|")[0].strip()

      Keywords = DetailsPage.find('meta', attrs={"name":"keywords"})['content'].split(",")
      if InternalStationName in Keywords:
        StartIndex = Keywords.index(InternalStationName) + 1
        CategoryList = []
        for Keyword in Keywords[StartIndex:]:
          XMLtree.SubElement (Show, "keyword", lang="de").text = Keyword

          # Assign categories based on the assignments file: first category
          # whose keyword list contains this keyword wins
          Category = ""
          for Assignment in CategoryAssignments:
            if Keyword in CategoryAssignments[Assignment]:
              Category = Assignment
              break

          if Category == "":
            if not (Keyword in UnknownKeywordsList):
              # Remember any keyword that does not match a category. At the end, put out all such keywords. Allows to improve the mappings above.
              UnknownKeywordsList.append(Keyword)
          elif Category != "Uncategorized":
            # Only add category once per show
            if not (Category in CategoryList):
              CategoryList.append(Category)

        for CategoryItem in CategoryList:
          XMLtree.SubElement (Show, "category", lang="en").text = CategoryItem

      # Show length in minutes is the first word of the 'show-for-medium' div
      XMLtree.SubElement (Show, "length", units="minutes").text = DetailsPage.find('div', 'show-for-medium').text.split(" ")[0].strip()

      XMLtree.SubElement (Show, "url").text = BaseURL + LinkTarget

      # Audio and video properties are represented by icons (img) in the page --> Look for the relevant images.
      # HD is assumed to imply 16:9, SD is assumed to be 4:3.
      VideoDescription = XMLtree.SubElement (Show, "video")
      XMLtree.SubElement (VideoDescription, 'present').text = 'yes'
      if DetailsPage.find('img', attrs={"title":"HD-TV"}):
        XMLtree.SubElement (VideoDescription, 'quality').text = 'HDTV'
        XMLtree.SubElement (VideoDescription, 'aspect').text = '16:9'
      else:
        XMLtree.SubElement (VideoDescription, 'quality').text = 'SD'
        XMLtree.SubElement (VideoDescription, 'aspect').text = '4:3'

      AudioDescription = XMLtree.SubElement (Show, "audio")
      XMLtree.SubElement (AudioDescription, 'present').text = 'yes'
      if DetailsPage.find('img', attrs={"title":"Dolby"}):
        XMLtree.SubElement (AudioDescription, 'stereo').text = 'dolby'
      elif DetailsPage.find('img', attrs={"title":"Stereo"}):
        XMLtree.SubElement (AudioDescription, 'stereo').text = 'stereo'
      else:
        XMLtree.SubElement (AudioDescription, 'stereo').text = 'mono'
      # "Untertitel für Gehörgeschädigte" = subtitles for the hearing impaired
      if DetailsPage.find('img', attrs={"title":"Untertitel für Gehörgeschädigte"}):
        XMLtree.SubElement (Show, 'subtitles', lang="de").text = 'teletext'
      # NOTE(review): the following else pairs only with the "Original mit
      # Untertitel" check, so a show may get both a teletext subtitles element
      # above AND a 'language' element here - presumably intended (shows that
      # are not subtitled originals are German), but confirm
      if DetailsPage.find('img', attrs={"title":"Original mit Untertitel"}):
        XMLtree.SubElement (Show, 'subtitles', lang="de").text = 'onscreen'
      else:
        XMLtree.SubElement (Show, 'language', lang="de").text = 'Deutsch'


# Done - now write the XML output file.
# To add a DOCTYPE, a tostring() is needed first: lxml's .write() does not
# accept a doctype argument.
XMLtvTree = XMLtree.ElementTree(XMLtv)
XMLcontent = XMLtree.tostring(XMLtvTree, pretty_print=True, encoding="utf-8", xml_declaration=True, doctype='<!DOCTYPE tv SYSTEM "xmltv.dtd">')
# Binary mode, since XMLcontent is already utf-8 encoded bytes; the 'with'
# block closes the file - no explicit close() needed
with open(XMLtvFileName, "wb") as XMLfile:
  XMLfile.write(XMLcontent)

# No lxml? Use this instead (missing DOCTYPE and also not nicely formatted):
# XMLtvTree.write(XMLtvFileName, encoding="utf-8", xml_declaration=True)

# Persist all keywords that did not match any category, one per line, so the
# category assignments file can be extended later. Overwrites the old file
# (the previous content was already loaded and merged above).
with open(UnknownKeywordsFileName, "w") as KeywordsFile:
  KeywordsFile.writelines(Keyword + "\n" for Keyword in UnknownKeywordsList)