1 files changed, 98 insertions, 24 deletions
diff --git a/coupons.py b/coupons.py
index 1e4b964..0356bfb 100755
--- a/coupons.py
+++ b/coupons.py
@@ -18,27 +18,79 @@
 # Documentation: README.md
 import sys, json, requests, os, datetime, re, textwrap
 
+# normalize() below is adapted from https://stackoverflow.com/questions/26105659/how-to-convert-the-unicode-to-latin-characters-python/61551939#61551939
+from typing import Optional
+import html, unicodedata
+
+class Config:
+   """ Runtime settings for the library; currently only the cache directory. """
+   def __init__(self, cache_dir = None):
+      """ Pick the cache directory: cache_dir if given, else $XDG_CACHE_HOME,
+          else $HOME/.cache, else /tmp. """
+      if not cache_dir:
+         # environ.get() returns None for an unset variable; it never raises
+         cache_dir = os.environ.get("XDG_CACHE_HOME")
+      if not cache_dir:
+         home = os.environ.get("HOME")
+         # an unset or empty HOME cannot anchor ~/.cache, so fall back to /tmp
+         cache_dir = os.path.join(home,".cache") if home else "/tmp"
+      # stored as given; no directory is created here
+      self.cache_dir = cache_dir
+
+def normalize(value: str, encoding: Optional[str] = None) -> str:
+    """
+    Reduce value to plain ASCII when no encoding is given: NFKD-decompose, drop the rest.
+    Otherwise round-trip it through encoding and decode backslash escapes and HTML entities.
+    """
+    if encoding is None:
+        return unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
+    value = value.encode(encoding, 'backslashreplace').decode(encoding)  # chars the codec cannot encode become backslash escapes
+    value = value.encode('ascii', 'xmlcharrefreplace').decode('unicode-escape')  # non-ASCII -> &#N; then backslash escapes are interpreted
+    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
+    return html.unescape(value)  # &#N; references become characters again (possibly non-ASCII)
+
 store_urls = {
    "publix": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106",
-   "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345"
+   "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345",
+   "food lion": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000a/widget.json?callback=jQuery111104817919592912373_1662672814198&_=1662672814199",  # fetch() indexes this dict with store.lower(): keys must stay lowercase
+   "lidl": "https://clipmunk.southernsavers.com/stores/59405bea724edc4175003366/widget.json?callback=jQuery111104720958887493587_1662672848590&_=1662672848591"
 }
 coupons_version = "2022-09-01a"
 
-def fetch(store, force = False, date = None):
+def fetch(store, force = False, date = None, config = None):
    """ Given a store name, visit the url and clean the json. If force, then update cached response."""
    # Reference:
    # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
    if store is None or store not in [f for f in store_urls]:
       print(f"ERROR (fetch): store {store} not a valid option.",file=sys.stderr)
       return -1
+   # validate date: fall back to today when it is missing, malformed or in the future
+   today = datetime.datetime.today().date()
+   if date is None:
+      ddate = today
+   else:
+      try:
+         ddate = datetime.datetime.strptime(date,"%Y-%m-%d").date()
+      except (TypeError, ValueError):
+         # ValueError: not YYYY-MM-DD; TypeError: not a string at all
+         ddate = today
+         print(f"WARNING(fetch): date {date} is invalid YYYY-MM-DD. Using {today} instead.",file=sys.stderr)
+      if ddate > today:
+         ddate = today
+         print(f"WARNING(fetch): date {date} is in the future. Using {today} instead.",file=sys.stderr)
+   # isoformat() always yields YYYY-MM-DD; "%F" depends on the platform's C library
+   date = ddate.isoformat()
+   if ddate != today:
+      # diagnostics go to stderr: stdout carries the JSON result
+      print(f"DEBUG(fetch): using date {date}",file=sys.stderr)
    # try to use cache at first
    contents = None
    if not force:
       contents = None
-      contents = get_cached_contents(store, date) # it is safe to return None
+      contents = get_cached_contents(store, date, config) # it is safe to return None
    # So if force == True, or the cache failed
    if contents is None or "" == contents:
-      print(f"INFO (fetch): no cached content, so visiting url",file=sys.stderr)
+      print(f"INFO (fetch): no cached content for {store},{date}, so visiting url",file=sys.stderr)
       try:
          url = store_urls[store.lower()]
       except:
@@ -46,10 +98,11 @@ def fetch(store, force = False, date = None):
       r = requests.get(url)
       contents = r.text
       # try to save to cache, but it is not a blocker
-      try:
-         set_cached_contents(store, date, contents)
-      except:
-         pass
+      try:
+         set_cached_contents(store, date, contents, config)
+      except OSError as e:
+         # best-effort, as the comment above says: keep the fetched payload regardless
+         print(f"INFO (fetch): unable to cache response: {e}",file=sys.stderr)
    return contents
 
 def clean(contents):
@@ -60,18 +113,22 @@ def clean(contents):
    a = re.sub("\],\\\\n.*$","]",a)
    a = re.sub("\\\\\\\\[uU]003[eE]",">",a)
    a = re.sub("\\\\\\\\[uU]003[cC]","<",a)
+   a = re.sub("\\\\\\\\[uU]0026","&",a)  # doubly-backslashed escape of U+0026 -> &
+   a = re.sub("\\\\\\\\[uU]201[cCdDeEfF]",'"',a)  # U+201C..U+201F double quotes -> "  NOTE(review): a bare " may end up unescaped inside a JSON string; confirm
+   a = re.sub("\\\\\\\\[uU]201[89aA]","'",a)  # U+2018, U+2019, U+201A single quotes -> '
    contents = re.sub('\\\\"','"',re.sub('\\\\\\\\"','\\\\"',a))
    return contents
 
-def get_cached_name(store, date = None):
+def get_cached_name(store, date = None, config = None):
    """
    Given store name, return cache filename regardless of existence or contents.
    """
    USE_CACHE = True
    store = store.lower()
-   cache_dir = os.environ.get("XDG_CACHE_HOME") # defaults to ~/.cache
-   if "" == cache_dir or cache_dir is None:
-      cache_dir = os.path.join(os.environ.get("HOME"),".cache")
+   if config is None:
+      # no config passed in: let Config() resolve the cache location from the environment
+      config = Config()
+   cache_dir = config.cache_dir
    # use an app-specific dir underneath it
    cache_dir = os.path.join(cache_dir, "coupons")
    if not os.path.isdir(cache_dir):
@@ -79,20 +136,24 @@ def get_cached_name(store, date = None):
          os.mkdir(cache_dir)
       except:
          # caching is not available; but this should not stop the program
+         print(f"INFO(get_cached_name): cannot create cache directory {cache_dir}.",file=sys.stderr)
          USE_CACHE = False
    if USE_CACHE:
       if date is None:
          date = datetime.datetime.today().strftime("%F")
       cache_file = os.path.join(cache_dir,"_".join([store,date]) + ".json")
-   return cache_file
+      # the path is returned whether or not the file exists yet
+      return cache_file
+   print(f"DEBUG(get_cached_name): no cache filename generated.",file=sys.stderr)
+   return None
 
-def get_cached_contents(store, date = None):
+def get_cached_contents(store, date = None, config = None):
    """
    Given store name, get cached contents
    Also, use today's date if not given a specific one.
    """
-   cache_file = get_cached_name(store, date)
-   if os.path.exists(cache_file):
+   cache_file = get_cached_name(store, date, config)
+   if cache_file is not None and os.path.exists(cache_file): # None: no usable cache directory
       try:
          print(f"INFO(get_cached_contents): using cache {cache_file}",file=sys.stderr)
          return open(cache_file,"r").read()
@@ -100,14 +161,24 @@ def get_cached_contents(store, date = None):
          print(f"INFO(get_cached_contents): unable to open existing cache file {cache_file}",file=sys.stderr)
    return None
 
-def set_cached_contents(store, date = None, contents = None):
+def set_cached_contents(store, date = None, contents = None, config = None):
+   """
+   Write the js+json payload, normalized to ASCII, to the cache file; skipped when no cache path is available.
+   """
    if contents is None or "" == contents:
       return True # cache nothing so short-circuit
+   # a missing date means "today", the same default get_cached_name applies
    if date is None:
       date = datetime.datetime.today().strftime("%F")
+   # get_cached_name lowercases the store again; doing it here as well is harmless
    store = store.lower()
-   cache_file = get_cached_name(store, date)
-   open(cache_file,"w").write(contents)
+   cache_file = get_cached_name(store, date, config)
+   if cache_file is not None:
+      print(f"DEBUG(set_cached_contents): saving cache {cache_file} size {len(contents)}",file=sys.stderr)
+      with open(cache_file,"w") as w:
+         # flatten weird characters into normal ones
+         # NOTE: a later cache hit therefore returns ASCII-only text, unlike the fresh response
+         w.write(normalize(contents))
 
 def parse_coupons(inputobject, searchstring = None):
    """
@@ -126,7 +197,10 @@ def parse_coupons(inputobject, searchstring = None):
          #response[rgroup][rcat] = []
          for i in c["items"]:
             text = i["html"]
-            if searchstring in text.lower():
+            if i.get("notes"):
+               text = text + " <small>" + i["notes"] + "</small>"
+            # search, not match: "." stops at newlines, so an anchored ".*" only ever saw the first line
+            if not searchstring or re.search(searchstring, text, re.IGNORECASE):
                # only make this group and category if we have a match
                if rgroup not in response:
                   response[rgroup] = {}
@@ -135,9 +209,9 @@ def parse_coupons(inputobject, searchstring = None):
                response[rgroup][rcat].append(text)
    return(json.dumps(response))
 
-def fetch_and_search(store, force = False, date = None, searchstring = None):
-   """ Main usage of the whole library. """
-   a = clean(fetch(store, force, date))
+def fetch_and_search(store, force = False, date = None, searchstring = None, config = None):
+   """ Main usage of the whole library for cli: fetch, clean, then search; returns a JSON string. """
+   a = clean(fetch(store, force, date, config))  # NOTE(review): fetch() returns -1 for an unknown store; confirm clean() copes with that
    return parse_coupons(a, searchstring)
 
 if "__main__" == __name__:
@@ -165,7 +239,7 @@ Basic usage:
       else:
          a = parse_coupons(sys.stdin.read(),args.search)
    else:
-      a = fetch_and_search(args.store,args.nocache,args.date,args.search)
+      a = fetch_and_search(args.store,args.nocache,args.date,args.search,config = None)
    if args.pretty:
       print(json.dumps(json.loads(a),indent=3))
    else: