#!/usr/bin/env python3
# File: coupons.py
# Location: .
# Author: bgstack15
# Startdate: 2022-08-31
# Title: Json Parser of Publix sales
# Project: coupons
# Purpose: Parse json for coupons that match lowercase string
# Usage:
#    called from check-sales.sh
#    Search with a lower-case string, against the lowercase values of coupon titles.
#       <input.json ./coupons.py --stdin --search 'candy'
# History:
#    I attempted to write similar logic with jq, but python is way easier
# Reference:
#    [internal] rod2/rod.py
# Improve:
# Documentation: README.md
import sys, json, requests, os, datetime, re, textwrap

# Ripped from https://stackoverflow.com/questions/26105659/how-to-convert-the-unicode-to-latin-characters-python/61551939#61551939
from typing import Optional
import html, unicodedata

class Config:
   """
   Runtime settings for the coupons library. Currently this is only the
   base cache directory.
   """
   def __init__(self, cache_dir = None):
      """
      Pick the base cache directory.

      Order of preference:
         1. the cache_dir argument, when it is not None or empty
         2. the XDG_CACHE_HOME environment variable
         3. $HOME/.cache
         4. /tmp, when HOME is unset or empty
      """
      if not cache_dir:
         # os.environ.get never raises; it returns None when the variable is unset
         cache_dir = os.environ.get("XDG_CACHE_HOME")
      if not cache_dir:
         home = os.environ.get("HOME")
         # an empty HOME would otherwise produce the relative path ".cache"
         cache_dir = os.path.join(home, ".cache") if home else "/tmp"
      #print(f"DEBUG(Config.init): cache_dir {cache_dir}")
      # base directory; get_cached_name() appends an app-specific subdirectory
      self.cache_dir = cache_dir

def normalize(value: str, encoding: Optional[str] = None) -> str:
    """
    Flatten a string down to plain ASCII.

    Without an encoding, the text is NFKD-decomposed and every character
    that still is not ASCII is dropped.
    With an encoding, the text is first round-tripped through that encoding
    (unencodable characters become backslash escapes), the escapes are
    expanded again, the same ASCII folding is applied, and finally any HTML
    character references are unescaped.
    """
    def _fold_to_ascii(text: str) -> str:
        # decompose accented characters, then discard whatever is not ascii
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed.encode('ascii', 'ignore').decode('ascii')

    if encoding is not None:
        escaped = value.encode(encoding, 'backslashreplace').decode(encoding)
        expanded = escaped.encode('ascii', 'xmlcharrefreplace').decode('unicode-escape')
        return html.unescape(_fold_to_ascii(expanded))
    return _fold_to_ascii(value)

# Supported stores: lowercase store name -> clipmunk.southernsavers.com widget.json url.
# The keys are the values accepted by fetch() and offered as choices for the --store option.
# NOTE(review): the callback= and _= query values look like they were captured from a single
# jQuery JSONP request; presumably the server accepts any value -- confirm before changing.
store_urls = {
   "publix": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106",
   "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345",
   "food lion": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000a/widget.json?callback=jQuery111104817919592912373_1662672814198&_=1662672814199",
   "lidl": "https://clipmunk.southernsavers.com/stores/59405bea724edc4175003366/widget.json?callback=jQuery111104720958887493587_1662672848590&_=1662672848591"
}
# Version string passed to the argparse "version" action in the cli section.
coupons_version = "2022-09-01a"

def fetch(store, force = False, date = None, config = None, timeout = 30):
   """
   Given a store name, return the raw widget.json response text for it.

   The cache is consulted first unless force is set. When there is no usable
   cached copy, the store url is visited and the response is saved to the
   cache on a best-effort basis.

   Parameters:
      store: store name, matched case-insensitively against store_urls.
      force: when true, skip reading the cache and always visit the url.
      date: YYYY-MM-DD string used only to pick the cache file. Invalid or
         future dates are replaced with today. None means today.
      config: Config object passed through to the cache helpers.
      timeout: seconds to wait for the web server before giving up.

   Returns the response text, or -1 when the store is not a valid option.
   All diagnostics go to stderr so stdout stays clean for the json output.
   """
   # Reference:
   # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
   if not isinstance(store, str) or store.lower() not in store_urls:
      print(f"ERROR (fetch): store {store} not a valid option.",file=sys.stderr)
      return -1
   # the url table and the cache filenames both use the lowercase name
   store = store.lower()
   # validate date
   today = datetime.date.today()
   ddate = today
   if date is not None:
      try:
         ddate = datetime.datetime.strptime(date,"%Y-%m-%d").date()
      except (TypeError, ValueError):
         print(f"WARNING(fetch): date {date} is invalid YYYY-MM-DD. Using {today} instead.",file=sys.stderr)
      else:
         if ddate > today:
            print(f"WARNING(fetch): date {date} is in the future. Using {today} instead.",file=sys.stderr)
            ddate = today
   # isoformat is YYYY-MM-DD on every platform, unlike the glibc-only %F
   date = ddate.isoformat()
   if ddate != today:
      print(f"DEBUG(fetch): using date {date}",file=sys.stderr)
   # try to use cache at first; it is safe for the helper to return None
   contents = None if force else get_cached_contents(store, date, config)
   # So if force == True, or the cache failed
   if not contents:
      print(f"INFO (fetch): no cached content for {store},{date}, so visiting url",file=sys.stderr)
      r = requests.get(store_urls[store], timeout = timeout)
      contents = r.text
      if r.ok:
         # try to save to cache, but it is not a blocker
         try:
            set_cached_contents(store, date, contents, config)
         except OSError as e:
            print(f"WARNING(fetch): unable to save cache for {store},{date}: {e}",file=sys.stderr)
      else:
         # do not let an error page become the cached copy for this date
         print(f"WARNING(fetch): http status {r.status_code} for {store}; response not cached",file=sys.stderr)
   return contents

def clean(contents):
   """
   Clean the javascript from southernsavers.com widget.json response.

   The response is javascript that embeds the coupon lists as an escaped
   json array. This keeps only that array and removes one level of string
   escaping, returning text that json.loads can read.
   """
   # Reference:
   # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
   # Raw strings keep each pattern readable and avoid the invalid-escape warning.
   # The order matters: each substitution runs on the output of the previous one.
   substitutions = (
      # drop everything up to and including the start of the lists array
      (r"^.*lists: \[", "["),
      # drop everything after the array; that is "]," and a literal backslash-n
      (r"\],\\n.*$", "]"),
      # double-escaped unicode sequences for > < &
      (r"\\\\[uU]003[eE]", ">"),
      (r"\\\\[uU]003[cC]", "<"),
      (r"\\\\[uU]0026", "&"),
      # curly double and single quotes become straight quotes
      # NOTE(review): a bare double quote inside a json string would end that
      # string early; confirm u201c-u201f never occur inside the lists data.
      (r"\\\\[uU]201[cCdDeEfF]", '"'),
      (r"\\\\[uU]201[89aA]", "'"),
      # unescape quotes: first two backslashes before a quote become one,
      # then one backslash before a quote is removed
      (r'\\\\"', r'\\"'),
      (r'\\"', '"'),
   )
   for pattern, replacement in substitutions:
      contents = re.sub(pattern, replacement, contents)
   return contents

def get_cached_name(store, date = None, config = None):
   """
   Given store name, return cache filename regardless of existence or contents.

   The file lives in a "coupons" directory underneath config.cache_dir and
   is named <store>_<date>.json, with the store name in lowercase. The
   directory is created when missing, including missing parent directories.

   Parameters:
      store: store name.
      date: YYYY-MM-DD string; defaults to today.
      config: Config object; a default one is built when None.

   Returns the path, or None when the cache directory cannot be created.
   Diagnostics go to stderr so stdout stays clean for the json output.
   """
   store = store.lower()
   if config is None:
      #print(f"DEBUG(get_cached_name): must generate new config")
      config = Config()
   # use an app-specific dir underneath the base cache dir
   cache_dir = os.path.join(config.cache_dir, "coupons")
   try:
      # makedirs also handles a base cache dir that does not exist yet
      os.makedirs(cache_dir, exist_ok = True)
   except OSError:
      # caching is not available; but this should not stop the program
      print(f"INFO(get_cached_name): cannot create cache directory {cache_dir}.",file=sys.stderr)
      print(f"DEBUG(get_cached_name): no cache filename generated.",file=sys.stderr)
      return None
   if date is None:
      date = datetime.date.today().isoformat()
   cache_file = os.path.join(cache_dir,"_".join([store,date]) + ".json")
   #print(f"DEBUG(get_cached_name): generated path is {cache_file}")
   return cache_file

def get_cached_contents(store, date = None, config = None):
   """
   Given store name, get cached contents
   Also, use today's date if not given a specific one.

   Returns the text of the cache file, or None when there is no cache file
   or it cannot be read. Diagnostics go to stderr.
   """
   cache_file = get_cached_name(store, date, config)
   if cache_file is None or not os.path.exists(cache_file):
      return None
   try:
      # the with block closes the file even when read() fails
      with open(cache_file,"r") as f:
         contents = f.read()
   except (OSError, UnicodeDecodeError):
      print(f"INFO(get_cached_contents): unable to open existing cache file {cache_file}",file=sys.stderr)
      return None
   print(f"INFO(get_cached_contents): using cache {cache_file}",file=sys.stderr)
   return contents

def set_cached_contents(store, date = None, contents = None, config = None):
   """
   Write the large js+json payload to a cache file, if possible. This is low-priority.

   Parameters:
      store: store name; lowercased for the filename.
      date: YYYY-MM-DD string; defaults to today.
      contents: text to save. It is flattened to ascii with normalize().
      config: Config object passed through to get_cached_name.

   Returns True when there was nothing to cache or the file was written,
   and False when no cache file is available or the write failed.
   Diagnostics go to stderr so stdout stays clean for the json output.
   """
   if not contents:
      return True # cache nothing so short-circuit
   #print(f"DEBUG(set_cached_contents): contents length {len(contents)}")
   if date is None:
      date = datetime.date.today().isoformat()
   #print(f"DEBUG(set_cached_contents): using date {date}")
   cache_file = get_cached_name(store.lower(), date, config)
   if cache_file is None:
      return False
   print(f"DEBUG(set_cached_contents): saving cache {cache_file} size {len(contents)}",file=sys.stderr)
   try:
      with open(cache_file,"w") as w:
         # flatten weird characters into normal ones
         w.write(normalize(contents))
   except OSError as e:
      # caching is best-effort; a failed write must not stop the program
      print(f"INFO(set_cached_contents): unable to write cache file {cache_file}: {e}",file=sys.stderr)
      return False
   return True

def parse_coupons(inputobject, searchstring = None):
   """
   Main logic to simplify the json down as well as return only results that match searchstring which should be lowercase.

   Parameters:
      inputobject: json text holding a list of groups. Each group has a
         "name" and "categories"; each category has a "name" and "items".
      searchstring: regular expression, lowercased before use and searched
         for anywhere in the lowercased item text. None or empty matches
         every item.

   Returns a json string shaped {group name: {category name: [item text]}}.
   Only groups and categories with at least one matching item are included.
   Nothing is printed, so stdout carries only what the caller prints.
   """
   groups = json.loads(inputobject)
   # compile once instead of rebuilding the pattern for every item
   pattern = re.compile((searchstring or "").lower())
   response = {}
   for group in groups:
      rgroup = group["name"]
      for c in group["categories"]:
         rcat = c["name"]
         for i in c["items"]:
            if "html" in i:
               text = i["html"]
            else:
               # no html of its own: use the name plus a list built from sub-items
               text = i["name"]
               add_text = "".join(
                  "<li>" + str(h.get("html", "")) + str(h.get("notes", "")) + "</li>"
                  for h in i.get("items", [])
               )
               if add_text:
                  text += "<ul>" + add_text + "</ul>"
            if i.get("notes"):
               text = text + " <small>" + i["notes"] + "</small>"
            # search, not match, so a hit after a newline in the text still counts
            if pattern.search(text.lower()):
               # only make this group and category if we have a match
               response.setdefault(rgroup, {}).setdefault(rcat, []).append(text)
   return json.dumps(response)

def fetch_and_search(store, force = False, date = None, searchstring = None, config = None):
   """
   Main usage of the whole library for cli.
   Fetch the response for the store (from cache or the web), clean it down
   to json, and return the json string of items that match searchstring.
   """
   raw_response = fetch(store, force, date, config)
   cleaned_json = clean(raw_response)
   return parse_coupons(cleaned_json, searchstring)

if "__main__" == __name__:
   # Command-line entry point: read json from stdin or fetch it for a store,
   # filter it with --search, and print the resulting json on stdout.
   import argparse
   parser = argparse.ArgumentParser(prog = sys.argv[0], description = "Search currently listed sales/coupons on SouthernSavers.com", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent("""To use the cache file as standard input, run:
   <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty

Basic usage:
   ./coupons.py --store "publix" --search "candy"
"""))
   parser.add_argument("-n","--nocache","--nc","--no-cache", action = "store_true", help = "Skip the cache and always visit site.")
   parser.add_argument("-d", "--date", help = "Use different YYYY-MM-DD than today, for cache purposes. Does not affect visiting the website")
   parser.add_argument("-s","--search", help = "Search for items that match this, when converted to lowercase. Can leave blank to display all items")
   parser.add_argument("--store", help = "Select sales from this store.", choices = list(store_urls))
   parser.add_argument("--stdin", action = "store_true", help = "Pipe stdin to parse_coupons. Can still use --search")
   parser.add_argument("--clean", action = "store_true", help = "If using --stdin, also clean the whole javascript input into just the useful json part.")
   parser.add_argument("-p","--pretty","--prettyprint","--pretty-print", action = "store_true", help = "Pretty-print json output")
   # each flag must be its own argument; "-V|--version" registered one literal option string
   parser.add_argument("-V","--version", action = "version", version = coupons_version)
   args = parser.parse_args()
   #print(args,file=sys.stderr)
   if args.stdin:
      contents = sys.stdin.read()
      if args.clean:
         contents = clean(contents)
      a = parse_coupons(contents,args.search)
   else:
      if args.store is None:
         # without a store, fetch returns its -1 error value and clean() then fails on it
         parser.error("--store is required unless --stdin is used")
      a = fetch_and_search(args.store,args.nocache,args.date,args.search,config = None)
   if args.pretty:
      print(json.dumps(json.loads(a),indent=3))
   else:
      print(a)