# -*- coding: iso-8859-15 -*-
# Needed for German umlauts

##################################################################
#
# ARD EPG pages scraper
# Output is XMLTV format (see http://wiki.xmltv.org/index.php/XMLTVFormat)
# Data source: https://programm.ard.de/
# For details refer to: https://projects.webvoss.de/2019/04/14/legal-epg-scraper-for-ard-tv-stations-to-use-with-tvheadend-external-xmltv-grabber/
#
# Usage: python3 GrabARD.py
# (python 2 may be used - watch for code adjustment requirements in the comments below)
#
# Version 4 by Hauke April 18th, 2019 (first stable version)
# Version 5 by Hauke April 22nd, 2019 (Category-Keyword assignments in separate file & more detailed)
#
# !!!    Private use only    !!!
# !!! Do not publish results !!!
# Everything else violates ARD policy!
#
###################################################################

import urllib.request as Webclient            # Run web requests
from bs4 import BeautifulSoup as WebScraper   # Framework to scrape data from HTML responses (apt-get install python3-bs4)
from lxml import etree as XMLtree             # To create XML file for XMLtv (apt-get install python3-lxml)
                                              # If you miss lxml, use instead:
                                              #   from xml.etree import ElementTree as XMLtree
                                              # and change all WebScraper-calls in the code from parser "lxml" to "html.parser" - and suffer a factor 2-4 performance penalty :-(
                                              # Also, XML output routine needs adjustment at the end of the code, and '<br>'/'<br/>' needs change at one place
import time                                   # Handle dates and times with timezone offsets
import os                                     # To set timezone and check for files


#####################
### Configuration ###
#####################

# File with keyword-to-category assignments.
# Format (parsed below): a line ending in ":" starts a category, following
# lines are its keywords, "#" lines are comments, blank lines are ignored.
CategoryAssignmentsFileName = "CategoryAssignments.txt"

# Where to store keywords that could not be matched to any category.
# Read back on the next run, so the list accumulates across runs.
UnknownKeywordsFileName = "UnknownKeywords.txt"

# Name of the XMLTV output file
XMLtvFileName = "ARD.xml"

# Number of days to scrape from the EPG (one extra day before today is
# grabbed as well - see the grabbing loop)
DaysToGrab = 14

# Debug mode? If True, progress (stations, days, shows) is printed to stdout
DebugMode = False

### End Configuration ###

# Read in the keyword-to-category assignments (if the file exists).
# File format: a line ending in ":" opens a new category; all following
# non-empty, non-comment lines are keywords belonging to that category.
# Result: CategoryAssignments maps category name -> list of keywords.
CategoryAssignments = {}
if os.path.exists(CategoryAssignmentsFileName):
  with open(CategoryAssignmentsFileName, "r") as AssignmentsFile:
    Assignments = AssignmentsFile.read().splitlines()
  # Note: the 'with' block closes the file - no explicit close() needed
  CurrentCategory = ""
  CurrentKeywords = []
  for Line in Assignments:
    Assignment = Line.strip()
    if Assignment == "" or Assignment[0] == "#":
      # Skip blank lines and comments
      continue
    if Assignment[-1] == ":":
      # A new category starts - store the keywords collected for the previous one
      if CurrentKeywords and CurrentCategory != "":
        CategoryAssignments[CurrentCategory] = CurrentKeywords
      CurrentCategory = Assignment[0:-1].strip()
      CurrentKeywords = []
    else:
      CurrentKeywords.append(Assignment)
  # Don't forget the assignments of the last category in the file
  if CurrentKeywords and CurrentCategory != "":
    CategoryAssignments[CurrentCategory] = CurrentKeywords
else:
  print ("Warning: No category assignments found - categories disabled")

# Reference timestamp for "today"; all EPG day offsets are computed from this.
Today = time.time()
# Force German local time regardless of host timezone (obviously correct for
# German TV). Do not use CET or CEST - Europe/Berlin dynamically selects the
# correct offset including daylight savings transitions.
os.environ['TZ'] = 'Europe/Berlin'
if hasattr(time, 'tzset'):
  # Make the TZ change effective for time.localtime()/time.mktime() right away.
  # Without tzset() the new TZ value is not guaranteed to be picked up.
  # (tzset is Unix-only, hence the hasattr guard.)
  time.tzset()

# Root element of the XMLTV output document. The attributes identify this
# grabber and the ARD pages the data was scraped from (per the XMLTV format).
GeneratorInfo = {
    "generator-info-name": "GrabARD.py from https://projects.webvoss.de",
    "generator-info-url": "https://projects.webvoss.de/2019/04/14/legal-epg-scraper-for-ard-tv-stations-to-use-with-tvheadend-external-xmltv-grabber/",
    "source-info-url": "https://www.ard.de",
    "source-info-name": "ARD",
    "source-data-url": "https://programm.ard.de/",
}
XMLtv = XMLtree.Element('tv', attrib=GeneratorInfo)

# Base URL of the ARD EPG pages; all scraped links are relative to this
BaseURL = 'https://programm.ard.de'

# Fetch the EPG start page, which lists all stations together with their EPG IDs
ARDpageRequest = Webclient.urlopen(BaseURL + '/TV/Programm/Sender')
ARDstartPage = WebScraper(ARDpageRequest, 'lxml')

# The station IDs live in the <option> entries of the station selector
TVstations = ARDstartPage.find_all('option', 'senderselektor_option')

# Register one <channel> element per station in the XMLTV tree
for Station in TVstations:
  ChannelID = Station['value']
  Channel = XMLtree.SubElement (XMLtv, "channel", id=ChannelID)
  XMLtree.SubElement (Channel, "display-name").text = Station.text
  XMLtree.SubElement (Channel, "url").text = BaseURL + '/TV/Programm/Sender?sender=' + ChannelID

# Load the list of previously uncategorized keywords (if the file exists),
# so repeated runs accumulate unknown keywords instead of starting over.
if os.path.exists(UnknownKeywordsFileName):
  with open(UnknownKeywordsFileName, "r") as KeywordsFile:
    UnknownKeywordsList = KeywordsFile.read().splitlines()
  # The 'with' block already closed the file - no explicit close() needed
else:
  UnknownKeywordsList = []    # collect keywords that are not matched to a category

# Now scrape the EPG overview pages of each station and append one <programme>
# element per show to the XMLTV tree.
for TVstation in TVstations:
  StationID = TVstation['value']
  StationName = TVstation.text

  if DebugMode:
    print ("Station: " + StationName)

  # Get the requested days - start at yesterday (day - 1 below), since the
  # ARD EPG pages are offset by 5 hours: a day's page already covers part of
  # the previous evening.
  for day in range(DaysToGrab + 1):

    # Create base date for XML start and end times and for the daily URL
    QueryDate = Today + ((day - 1) * 86400)       # Start at day before
    ShowDate = time.strftime("%Y%m%d", time.localtime(QueryDate))
    OldShowDate = ShowDate                        # Needed for VPS times that lie before midnight

    if DebugMode:
      print ("Day: " + str(day) + " (" + ShowDate + ") for " + StationName)

    # Get the daily overview EPG page for the given station ID
    EPGrequest = Webclient.urlopen(BaseURL + '/TV/Programm/Sender?sender=' + StationID + '&datum=' + time.strftime("%d.%m.%Y", time.localtime(QueryDate)))

    # Search for all show detail pages linked from the overview
    EPGpage = WebScraper(EPGrequest, 'lxml')
    Links = EPGpage.find_all('a', 'sendungslink')

    # Cycle through all shows and get the detail pages for show information
    LastEndHour = "00"    # to detect midnight rollover between consecutive shows
    for Link in Links:

      # Get URL of show details page
      LinkTarget = Link.get('href')
      DetailWebResponse = Webclient.urlopen(BaseURL + LinkTarget)
      DetailsPage = WebScraper(DetailWebResponse, 'lxml')

      # Find start and end time in the 'small-detail' div; the raw HTML looks
      # like "...<br/>HH:MM - HH:MM Uhr..." (optionally followed by a VPS time)
      TimePart = str(DetailsPage.find('div', 'small-detail'))
      TimeInformation = TimePart.split("<br/>")[1].split("Uhr")[0].split(" - ")   # with html.parser "<br>"!

      StartTime = TimeInformation[0].split(":")
      # NOTE(review): hour comparisons here and below are string comparisons -
      # this only works if the pages always use zero-padded two-digit hours
      # ("05", not "5") - TODO confirm against the live pages
      if StartTime[0].strip() < LastEndHour:
        # Midnight rollover just before this show: advance the base date
        QueryDate += 86400
        ShowDate = time.strftime("%Y%m%d", time.localtime(QueryDate))

      EndTime = TimeInformation[1].split(":")
      LastEndHour = EndTime[0].strip()

      # Construct show start BEFORE checking for a midnight rollover during
      # the show (the start still needs the old date in that case)
      ShowStart = ShowDate + StartTime[0].strip() + StartTime[1].strip()

      if EndTime[0].strip() < StartTime[0].strip():
        # Midnight rollover during the show: the end time is on the next day
        QueryDate += 86400
        ShowDate = time.strftime("%Y%m%d", time.localtime(QueryDate))

      ShowEnd = ShowDate + EndTime[0].strip() + EndTime[1].strip()

      # Add timezone offset according to daylight savings time.
      # Day of time switch itself is no problem with ARD: They adjust EPG and program to this, so conversion works.
      # Change "time.localtime" to "time.gmtime" in the following two lines for python 2
      LocalStartTime = time.strftime('%Y%m%d%H%M00 %z', time.localtime(time.mktime(time.strptime(ShowStart, '%Y%m%d%H%M'))))
      LocalEndTime = time.strftime('%Y%m%d%H%M00 %z', time.localtime(time.mktime(time.strptime(ShowEnd, '%Y%m%d%H%M'))))
      Show = XMLtree.SubElement(XMLtv, "programme", channel=StationID, start=LocalStartTime, stop=LocalEndTime)

      # If a VPS time is present on the page, add it as a 'vps-start' attribute
      VPSinformation = TimePart.split("VPS")
      if len(VPSinformation) > 1:
        VPStime = VPSinformation[1].split("<")[0].strip().split(":")
        # String hour comparison again - same zero-padding assumption as above
        if VPStime[0].strip() > EndTime[0].strip():
          # VPS time before midnight: use the day's original base date
          VPSStart = OldShowDate + VPStime[0].strip() + VPStime[1].strip()
        else:
          # VPS time needs to roll over to the (already advanced) next day
          VPSStart = ShowDate + VPStime[0].strip() + VPStime[1].strip()
        # Change "time.localtime" to "time.gmtime" in the following line for python 2
        VPSStartTime = time.strftime('%Y%m%d%H%M00 %z', time.localtime(time.mktime(time.strptime(VPSStart, '%Y%m%d%H%M'))))
        Show.set('vps-start', VPSStartTime)

      # Add other information (title, subtitle, description, credits (if present), keywords, length, URL, video and audio description)
      # Some are in the meta tags (already well formatted as text - nice!), some need to be detected in the HTML code.
      Title = DetailsPage.find('meta', property='og:title')['content']
      XMLtree.SubElement(Show, "title", lang="de").text = Title

      if DebugMode:
        # print ("Show: " + Title.encode('utf-8'))      # python 2
        print ("Show: " + Title)                      # python 3

      # Subtitle is optional; text after "|" is discarded (layout noise)
      SubtitleTag = DetailsPage.find('span', 'subtitle')
      if SubtitleTag:
        XMLtree.SubElement(Show, "sub-title", lang="de").text = SubtitleTag.text.split("|")[0].strip()

      XMLtree.SubElement(Show, "desc", lang="de").text = DetailsPage.find('meta', attrs={"name":"description"})['content']

      # Credits table ('besetzung' = cast): one <tr> per person with a role
      # cell and an actor cell
      Credits = DetailsPage.find('table', 'besetzung')
      if Credits:
        CreditXML = XMLtree.SubElement(Show, "credits")
        CreditEntries = Credits.find_all('tr')
        Role = "actor"
        for Credit in CreditEntries:
          LastRole = Role   # sometimes role is empty, which implies its the same as line before
          Role = Credit.find('td', 'role').text.strip()
          if Role == "":
            Role = LastRole
          ActorPart = Credit.find('td', 'actor')
          Actor = ActorPart.find('a').text.strip()
          # Transform German role names to the international XMLTV credit tags;
          # anything unrecognized is treated as an actor with the role as text
          if Role == "Regie":
            XMLtree.SubElement(CreditXML, 'director').text = Actor
          elif Role in ["Buch", "Drehbuch"]:
            XMLtree.SubElement(CreditXML, 'writer').text = Actor
          elif Role == "Redaktion":
            XMLtree.SubElement(CreditXML, 'editor').text = Actor
          elif Role == "Moderation":
            XMLtree.SubElement(CreditXML, 'presenter').text = Actor
          elif Role in ["Musik", "Komposition"]:
            XMLtree.SubElement(CreditXML, 'composer').text = Actor
          else:
            XMLtree.SubElement(CreditXML, 'actor', role=Role).text = Actor

      # First keywords are repeat of title etc. - not useful. Useful keywords start after the name of the station in the keywords.
      # For some stations, the station name in the keywords does not match the "official" name --> First find out the "internal" station name
      InternalStationName = DetailsPage.find('title').text.split(" - ")[-1].split("|")[0].strip()

      Keywords = DetailsPage.find('meta', attrs={"name":"keywords"})['content'].split(",")
      if InternalStationName in Keywords:
        StartIndex = Keywords.index(InternalStationName) + 1
        CategoryList = []
        for Keyword in Keywords[StartIndex:]:
          XMLtree.SubElement (Show, "keyword", lang="de").text = Keyword

          # Assign categories based on the assignments file: first category
          # whose keyword list contains this keyword wins
          Category = ""
          for Assignment in CategoryAssignments:
            if Keyword in CategoryAssignments[Assignment]:
              Category = Assignment
              break

          if Category == "":
            if not (Keyword in UnknownKeywordsList):
              # Remember any keyword that does not match a category. At the end, put out all such keywords. Allows to improve the mappings above.
              UnknownKeywordsList.append(Keyword)
          elif Category != "Uncategorized":
            # Only add category once per show
            if not (Category in CategoryList):
              CategoryList.append(Category)

        for CategoryItem in CategoryList:
          XMLtree.SubElement (Show, "category", lang="en").text = CategoryItem

      # Show length in minutes is the first word of the 'show-for-medium' div
      XMLtree.SubElement (Show, "length", units="minutes").text = DetailsPage.find('div', 'show-for-medium').text.split(" ")[0].strip()

      XMLtree.SubElement (Show, "url").text = BaseURL + LinkTarget

      # Audio and video properties are represented by icons (img) in the page --> Look for the relevant images.
      # HD is assumed to imply 16:9, SD is assumed to be 4:3.
      VideoDescription = XMLtree.SubElement (Show, "video")
      XMLtree.SubElement (VideoDescription, 'present').text = 'yes'
      if DetailsPage.find('img', attrs={"title":"HD-TV"}):
        XMLtree.SubElement (VideoDescription, 'quality').text = 'HDTV'
        XMLtree.SubElement (VideoDescription, 'aspect').text = '16:9'
      else:
        XMLtree.SubElement (VideoDescription, 'quality').text = 'SD'
        XMLtree.SubElement (VideoDescription, 'aspect').text = '4:3'

      AudioDescription = XMLtree.SubElement (Show, "audio")
      XMLtree.SubElement (AudioDescription, 'present').text = 'yes'
      if DetailsPage.find('img', attrs={"title":"Dolby"}):
        XMLtree.SubElement (AudioDescription, 'stereo').text = 'dolby'
      elif DetailsPage.find('img', attrs={"title":"Stereo"}):
        XMLtree.SubElement (AudioDescription, 'stereo').text = 'stereo'
      else:
        XMLtree.SubElement (AudioDescription, 'stereo').text = 'mono'
      # "Untertitel für Gehörgeschädigte" = subtitles for the hearing impaired
      if DetailsPage.find('img', attrs={"title":"Untertitel für Gehörgeschädigte"}):
        XMLtree.SubElement (Show, 'subtitles', lang="de").text = 'teletext'
      # NOTE(review): the following else pairs only with the "Original mit
      # Untertitel" check, so a show may get both a teletext subtitles element
      # above AND a 'language' element here - presumably intended (shows that
      # are not subtitled originals are German), but confirm
      if DetailsPage.find('img', attrs={"title":"Original mit Untertitel"}):
        XMLtree.SubElement (Show, 'subtitles', lang="de").text = 'onscreen'
      else:
        XMLtree.SubElement (Show, 'language', lang="de").text = 'Deutsch'


# Done - now write the XML output file.
# To add a DOCTYPE, a tostring() is needed first: lxml's .write() does not
# accept a doctype argument.
XMLtvTree = XMLtree.ElementTree(XMLtv)
XMLcontent = XMLtree.tostring(XMLtvTree, pretty_print=True, encoding="utf-8", xml_declaration=True, doctype='<!DOCTYPE tv SYSTEM "xmltv.dtd">')
# Binary mode, since XMLcontent is already utf-8 encoded bytes; the 'with'
# block closes the file - no explicit close() needed
with open(XMLtvFileName, "wb") as XMLfile:
  XMLfile.write(XMLcontent)

# No lxml? Use this instead (missing DOCTYPE and also not nicely formatted):
# XMLtvTree.write(XMLtvFileName, encoding="utf-8", xml_declaration=True)

# Persist all keywords that did not match any category, one per line, so the
# category assignments file can be extended later. Overwrites the old file
# (the previous content was already loaded and merged above).
with open(UnknownKeywordsFileName, "w") as KeywordsFile:
  KeywordsFile.writelines(Keyword + "\n" for Keyword in UnknownKeywordsList)