diff options
Diffstat (limited to 'coupons.py')
-rwxr-xr-x | coupons.py | 122 |
1 files changed, 98 insertions, 24 deletions
@@ -18,27 +18,79 @@ # Documentation: README.md import sys, json, requests, os, datetime, re, textwrap +# Ripped from https://stackoverflow.com/questions/26105659/how-to-convert-the-unicode-to-latin-characters-python/61551939#61551939 +from typing import Optional +import html, unicodedata + +class Config: + def __init__(self, cache_dir = None): + if "" == cache_dir or cache_dir is None: + try: + cache_dir = os.environ.get("XDG_CACHE_HOME") # defaults to ~/.cache + except: + pass + if "" == cache_dir or cache_dir is None: + try: + cache_dir = os.path.join(os.environ.get("HOME"),".cache") + except: + cache_dir = "/tmp" + #print(f"DEBUG(Config.init): cache_dir {cache_dir}") + self.cache_dir = cache_dir + +def normalize(value: str, encoding: Optional[str] = None) -> str: + """ + Normalize characters not maintainable when encode. + The default encoding is "ascii". + """ + if encoding is None: + return unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') + value = value.encode(encoding, 'backslashreplace').decode(encoding) + value = value.encode('ascii', 'xmlcharrefreplace').decode('unicode-escape') + value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii') + return html.unescape(value) + store_urls = { "publix": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106", - "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345" + "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345", + "food lion": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000a/widget.json?callback=jQuery111104817919592912373_1662672814198&_=1662672814199", + "lidl": "https://clipmunk.southernsavers.com/stores/59405bea724edc4175003366/widget.json?callback=jQuery111104720958887493587_1662672848590&_=1662672848591" } coupons_version = "2022-09-01a" -def fetch(store, force = False, date = None): +def fetch(store, force = False, date = None, config = None): """ Given a store name, visit the url and clean the json. If force, then update cached response.""" # Reference: # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq if store is None or store not in [f for f in store_urls]: print(f"ERROR (fetch): store {store} not a valid option.",file=sys.stderr) return -1 + # validate date + today = datetime.datetime.today().date() + if date is None: + date = today.strftime("%F") + try: + if date is not None: + ddate = datetime.datetime.strptime(date,"%Y-%m-%d").date() + if ddate > today: + ddate = today + print(f"WARNING(fetch): date {date} is in the future. Using {today} instead.") + except: + ddate = today + print(f"WARNING(fetch): date {date} is invalid YYYY-MM-DD. Using {today} instead.") + try: + date = ddate.strftime("%F") + except: + pass + if date is not None and date != today.strftime("%F"): + print(f"DEBUG(fetch): using date {date}") # try to use cache at first contents = None if not force: contents = None - contents = get_cached_contents(store, date) # it is safe to return None + contents = get_cached_contents(store, date, config) # it is safe to return None # So if force == True, or the cache failed if contents is None or "" == contents: - print(f"INFO (fetch): no cached content, so visiting url",file=sys.stderr) + print(f"INFO (fetch): no cached content for {store},{date}, so visiting url",file=sys.stderr) try: url = store_urls[store.lower()] except: @@ -46,10 +98,11 @@ def fetch(store, force = False, date = None): r = requests.get(url) contents = r.text # try to save to cache, but it is not a blocker - try: - set_cached_contents(store, date, contents) - except: - pass + #try: + if True: + set_cached_contents(store, date, contents, config) + #except: + # pass return contents def clean(contents): @@ -60,18 +113,22 @@ def clean(contents): a = re.sub("\],\\\\n.*$","]",a) a = re.sub("\\\\\\\\[uU]003[eE]",">",a) a = re.sub("\\\\\\\\[uU]003[cC]","<",a) + a = re.sub("\\\\\\\\[uU]0026","&",a) + a = re.sub("\\\\\\\\[uU]201[cCdDeEfF]",'"',a) + a = re.sub("\\\\\\\\[uU]201[89aA]","'",a) contents = re.sub('\\\\"','"',re.sub('\\\\\\\\"','\\\\"',a)) return contents -def get_cached_name(store, date = None): +def get_cached_name(store, date = None, config = None): """ Given store name, return cache filename regardless of existence or contents. """ USE_CACHE = True store = store.lower() - cache_dir = os.environ.get("XDG_CACHE_HOME") # defaults to ~/.cache - if "" == cache_dir or cache_dir is None: - cache_dir = os.path.join(os.environ.get("HOME"),".cache") + if config is None: + #print(f"DEBUG(get_cached_name): must generate new config") + config = Config() + cache_dir = config.cache_dir # use an app-specific dir underneath it cache_dir = os.path.join(cache_dir, "coupons") if not os.path.isdir(cache_dir): @@ -79,20 +136,24 @@ def get_cached_name(store, date = None): os.mkdir(cache_dir) except: # caching is not available; but this should not stop the program + print(f"INFO(get_cached_name): cannot create cache directory {cache_dir}.") USE_CACHE = False if USE_CACHE: if date is None: date = datetime.datetime.today().strftime("%F") cache_file = os.path.join(cache_dir,"_".join([store,date]) + ".json") - return cache_file + #print(f"DEBUG(get_cached_name): generated path is {cache_file}") + return cache_file + print(f"DEBUG(get_cached_name): no cache filename generated.") + return None -def get_cached_contents(store, date = None): +def get_cached_contents(store, date = None, config = None): """ Given store name, get cached contents Also, use today's date if not given a specific one. """ - cache_file = get_cached_name(store, date) - if os.path.exists(cache_file): + cache_file = get_cached_name(store, date, config) + if cache_file is not None and os.path.exists(cache_file): try: print(f"INFO(get_cached_contents): using cache {cache_file}",file=sys.stderr) return open(cache_file,"r").read() @@ -100,14 +161,24 @@ def get_cached_contents(store, date = None): print(f"INFO(get_cached_contents): unable to open existing cache file {cache_file}",file=sys.stderr) return None -def set_cached_contents(store, date = None, contents = None): +def set_cached_contents(store, date = None, contents = None, config = None): + """ + Write the large js+json payload to a cache file, if possible. This is low-priority. + """ if contents is None or "" == contents: return True # cache nothing so short-circuit + #print(f"DEBUG(set_cached_contents): contents length {len(contents)}") if date is None: date = datetime.datetime.today().strftime("%F") + #print(f"DEBUG(set_cached_contents): using date {date}") store = store.lower() - cache_file = get_cached_name(store, date) - open(cache_file,"w").write(contents) + cache_file = get_cached_name(store, date, config) + if cache_file is not None: + print(f"DEBUG(set_cached_contents): saving cache {cache_file} size {len(contents)}") + with open(cache_file,"w") as w: + # flatten weird characters into normal ones + #w.write(contents.encode('utf-8').decode('latin-1')) + w.write(normalize(contents)) def parse_coupons(inputobject, searchstring = None): """ @@ -126,7 +197,10 @@ def parse_coupons(inputobject, searchstring = None): #response[rgroup][rcat] = [] for i in c["items"]: text = i["html"] - if searchstring in text.lower(): + if "notes" in i and i["notes"] != "": + text = text + " <small>" + i["notes"] + "</small>" + #if searchstring in text.lower(): + if re.match(".*(" + searchstring.lower() + ").*", text.lower()): # only make this group and category if we have a match if rgroup not in response: response[rgroup] = {} @@ -135,9 +209,9 @@ def parse_coupons(inputobject, searchstring = None): response[rgroup][rcat].append(text) return(json.dumps(response)) -def fetch_and_search(store, force = False, date = None, searchstring = None): - """ Main usage of the whole library. """ - a = clean(fetch(store, force, date)) +def fetch_and_search(store, force = False, date = None, searchstring = None, config = None): + """ Main usage of the whole library for cli. """ + a = clean(fetch(store, force, date, config)) return parse_coupons(a, searchstring) if "__main__" == __name__: @@ -165,7 +239,7 @@ Basic usage: else: a = parse_coupons(sys.stdin.read(),args.search) else: - a = fetch_and_search(args.store,args.nocache,args.date,args.search) + a = fetch_and_search(args.store,args.nocache,args.date,args.search,config = None) if args.pretty: print(json.dumps(json.loads(a),indent=3)) else: |