#!/usr/bin/env python3
# File: coupons.py
# Location: .
# Author: bgstack15
# Startdate: 2022-08-31
# Title: Json Parser of Publix sales
# Project: coupons
# Purpose: Parse json for coupons that match lowercase string
# Usage:
#    called from check-sales.sh
#    Search with a lower-case string, against the lowercase values of coupon titles.
# NOTE(review): the remainder of this header (and the original import block)
#    was not recoverable from the reviewed text; the imports below were
#    rebuilt from the names this module actually uses.
"""Search the sales/coupon listings published by SouthernSavers.com.

The module downloads (or reads from a per-day cache) the JSONP "widget"
payload for a store, strips it down to plain JSON, and filters the items by a
lower-cased regular expression.
"""

import datetime
import html
import json
import os
import re
import sys
import textwrap
import unicodedata

# Portable spelling of the glibc-only "%F" strftime directive.
_DATE_FORMAT = "%Y-%m-%d"


def _log(message):
    """Write a diagnostic line to stderr so stdout carries only JSON."""
    print(message, file=sys.stderr)


class Config:
    """Hold runtime settings; only ``cache_dir`` is used by this module.

    NOTE(review): the original definition (or import) of ``Config`` is not
    visible in the reviewed text. This minimal stand-in is an assumption:
    it uses ``$XDG_CACHE_HOME`` and falls back to ``~/.cache``, which matches
    the ``~/.cache/coupons/...`` path shown in the CLI help. Replace it with
    the project's real ``Config`` if one exists.
    """

    def __init__(self, cache_dir=None):
        if not cache_dir:
            cache_dir = os.environ.get("XDG_CACHE_HOME") or os.path.join(
                os.path.expanduser("~"), ".cache"
            )
        self.cache_dir = cache_dir


# NOTE(review): the signature was rebuilt from the body and from the
# one-argument call in set_cached_contents(); confirm against the original.
def normalize(value: str, encoding=None) -> str:
    """Flatten characters that cannot be kept when encoding to ASCII.

    With ``encoding`` left as None the value is NFKD-decomposed and every
    non-ASCII code point is dropped. With an explicit ``encoding`` the value
    is first round-tripped through that codec (backslash-escaping what it
    cannot represent), escape sequences are decoded, the result is reduced to
    ASCII, and HTML entities are unescaped.
    """
    if encoding is None:
        return unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    value = value.encode(encoding, "backslashreplace").decode(encoding)
    value = value.encode("ascii", "xmlcharrefreplace").decode("unicode-escape")
    value = unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("ascii")
    return html.unescape(value)


store_urls = {
    "publix": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106",
    "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345",
    "food lion": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000a/widget.json?callback=jQuery111104817919592912373_1662672814198&_=1662672814199",
    "lidl": "https://clipmunk.southernsavers.com/stores/59405bea724edc4175003366/widget.json?callback=jQuery111104720958887493587_1662672848590&_=1662672848591",
}

coupons_version = "2022-09-01a"


def fetch(store, force=False, date=None, config=None):
    """Return the raw widget payload for ``store``, using the cache if allowed.

    Parameters:
        store: store name; looked up case-insensitively in ``store_urls``.
        force: when true, skip the cache lookup and always visit the url.
        date: optional ``YYYY-MM-DD`` string selecting the cache entry. An
            invalid or future date is replaced by today's date with a warning.
            It does not change which url is visited.
        config: object exposing ``cache_dir``; passed through to the cache
            helpers, which create a default one when it is None.

    Returns:
        The payload text (still wrapped in javascript; see ``clean``).

    Raises:
        ValueError: when no url is known for ``store``.
    """
    # NOTE(review): the original date-parsing statements were partly lost;
    # this is rebuilt from the surviving warning branches.
    today = datetime.date.today()
    if date is not None:
        try:
            ddate = datetime.datetime.strptime(date, _DATE_FORMAT).date()
        except (TypeError, ValueError):
            ddate = today
            _log(f"WARNING(fetch): date {date} is invalid YYYY-MM-DD. Using {today} instead.")
        else:
            if ddate > today:
                ddate = today
                _log(f"WARNING(fetch): date {date} is in the future. Using {today} instead.")
        date = ddate.strftime(_DATE_FORMAT)
        if date != today.strftime(_DATE_FORMAT):
            _log(f"DEBUG(fetch): using date {date}")

    # Try the cache first; get_cached_contents() returns None on any miss.
    contents = None
    if not force:
        contents = get_cached_contents(store, date, config)

    # Reached when force is set, or when the cache was missing/empty.
    if not contents:
        _log(f"INFO (fetch): no cached content for {store},{date}, so visiting url")
        try:
            url = store_urls[store.lower()]
        except (KeyError, AttributeError) as err:
            _log(f"ERROR (fetch): no url saved for store {store}")
            raise ValueError(f"no url saved for store {store}") from err
        # Imported lazily so the parsing helpers stay usable (e.g. --stdin)
        # on systems without the third-party requests package.
        import requests

        response = requests.get(url, timeout=60)
        contents = response.text
        # Saving to the cache is best-effort and must not block the result.
        try:
            set_cached_contents(store, date, contents, config)
        except OSError as err:
            _log(f"INFO (fetch): unable to save cache for {store},{date}: {err}")
    return contents


def clean(contents):
    """Reduce the southernsavers.com widget.json response to plain JSON text.

    The payload is javascript with doubly-escaped unicode sequences. This
    replaces the escaped angle brackets, ampersand and typographic quotes with
    their ASCII characters and un-escapes the double quotes. The same
    transformation was originally done with a ``curl | sed`` pipeline.
    """
    # NOTE(review): the next two substitutions (stripping the javascript
    # around the "lists" array) were lost from the reviewed text and are
    # reconstructed from memory of the reference sed command; verify them
    # against a real payload.
    a = re.sub(r"^.*lists: \[", "[", contents)
    a = re.sub(r"\],\\n.*$", "]", a)
    a = re.sub(r"\\\\[uU]003[eE]", ">", a)
    a = re.sub(r"\\\\[uU]003[cC]", "<", a)
    a = re.sub(r"\\\\[uU]0026", "&", a)
    a = re.sub(r"\\\\[uU]201[cCdDeEfF]", '"', a)
    a = re.sub(r"\\\\[uU]201[89aA]", "'", a)
    # Two passes, in this order: \\" becomes \" and then every \" becomes ".
    a = re.sub(r'\\\\"', r'\\"', a)
    return re.sub(r'\\"', '"', a)


def get_cached_name(store, date=None, config=None):
    """Return the cache filename for ``store`` and ``date``.

    The file itself may or may not exist. The name is
    ``<config.cache_dir>/coupons/<store>_<date>.json`` with the store
    lower-cased and ``date`` defaulting to today. Returns None when the cache
    directory cannot be created; caching is optional, so this never raises
    for that reason.
    """
    store = store.lower()
    if config is None:
        config = Config()
    # Use an app-specific directory underneath the configured cache dir.
    cache_dir = os.path.join(config.cache_dir, "coupons")
    try:
        os.makedirs(cache_dir, exist_ok=True)
    except OSError:
        _log(f"INFO(get_cached_name): cannot create cache directory {cache_dir}.")
        _log("DEBUG(get_cached_name): no cache filename generated.")
        return None
    if date is None:
        date = datetime.date.today().strftime(_DATE_FORMAT)
    return os.path.join(cache_dir, "_".join([store, date]) + ".json")


def get_cached_contents(store, date=None, config=None):
    """Return the cached payload for ``store``, or None when unavailable.

    Today's date is used when ``date`` is not given. A missing or unreadable
    cache file yields None rather than an exception.
    """
    cache_file = get_cached_name(store, date, config)
    if cache_file is None or not os.path.exists(cache_file):
        return None
    _log(f"INFO(get_cached_contents): using cache {cache_file}")
    try:
        with open(cache_file, "r", encoding="utf-8") as handle:
            return handle.read()
    except (OSError, UnicodeDecodeError):
        _log(f"INFO(get_cached_contents): unable to open existing cache file {cache_file}")
        return None


def set_cached_contents(store, date=None, contents=None, config=None):
    """Write the large js+json payload to the cache file, if possible.

    Empty or None ``contents`` is not cached and returns True immediately.
    Otherwise the text is flattened to ASCII with ``normalize`` and written to
    the name given by ``get_cached_name``; nothing is written when no cache
    name is available. May raise OSError if the file cannot be written.
    """
    if contents is None or "" == contents:
        return True  # nothing to cache, so short-circuit
    if date is None:
        date = datetime.date.today().strftime(_DATE_FORMAT)
    cache_file = get_cached_name(store.lower(), date, config)
    if cache_file is not None:
        _log(f"DEBUG(set_cached_contents): saving cache {cache_file} size {len(contents)}")
        with open(cache_file, "w", encoding="utf-8") as handle:
            # Flatten unusual characters into plain ASCII before saving.
            handle.write(normalize(contents))


def _item_text(item):
    """Build the display text for one item, including sub-items and notes.

    NOTE(review): the HTML tag literals in this function were stripped from
    the reviewed text; the list and small tags below are reconstructed and
    should be confirmed against the original.
    """
    try:
        text = item["html"]
    except KeyError:
        text = item["name"]
    sub_items = "".join(
        "<li>" + str(sub.get("html", "")) + str(sub.get("notes", "")) + "</li>"
        for sub in item.get("items", [])
    )
    if sub_items:
        text += "<ul>" + sub_items + "</ul>"
    if item.get("notes", "") != "":
        text = text + " <small>" + item["notes"] + "</small>"
    return text


def parse_coupons(inputobject, searchstring=None):
    """Filter cleaned JSON down to the items matching ``searchstring``.

    Parameters:
        inputobject: JSON text holding a list of groups; each group has a
            "name" and "categories", each category a "name" and "items".
        searchstring: regular expression, lower-cased before use and searched
            anywhere in the lower-cased item text. None or "" matches every
            item.

    Returns:
        JSON text of ``{group: {category: [item text, ...]}}`` containing only
        groups and categories that have at least one match.

    Raises:
        re.error: when ``searchstring`` is not a valid regular expression.
    """
    groups = json.loads(inputobject)
    # Compile once; the group keeps alternations in searchstring self-contained.
    pattern = re.compile("(" + (searchstring or "").lower() + ")")
    response = {}
    for group in groups:
        group_name = group["name"]
        for category in group["categories"]:
            category_name = category["name"]
            for item in category["items"]:
                text = _item_text(item)
                if pattern.search(text.lower()):
                    # Create the group and category only once a match exists.
                    response.setdefault(group_name, {}).setdefault(category_name, []).append(text)
    return json.dumps(response)


def fetch_and_search(store, force=False, date=None, searchstring=None, config=None):
    """Fetch, clean and filter in one call; the main library entry point."""
    cleaned = clean(fetch(store, force, date, config))
    return parse_coupons(cleaned, searchstring)


def main(argv=None):
    """Run the command-line interface; ``argv`` defaults to ``sys.argv[1:]``."""
    import argparse

    parser = argparse.ArgumentParser(
        prog=sys.argv[0],
        description="Search currently listed sales/coupons on SouthernSavers.com",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent("""\
            To use the cache file as standard input, run:
               <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty

            Basic usage:
               ./coupons.py --store "publix" --search "candy"
            """),
    )
    parser.add_argument("-n", "--nocache", "--nc", "--no-cache", action="store_true", help="Skip the cache and always visit site.")
    parser.add_argument("-d", "--date", help="Use different YYYY-MM-DD than today, for cache purposes. Does not affect visiting the website")
    parser.add_argument("-s", "--search", help="Search for items that match this, when converted to lowercase. Can leave blank to display all items")
    parser.add_argument("--store", help="Select sales from this store.", choices=list(store_urls))
    parser.add_argument("--stdin", action="store_true", help="Pipe stdin to parse_coupons. Can still use --search")
    parser.add_argument("--clean", action="store_true", help="If using --stdin, also clean the whole javascript input into just the useful json part.")
    parser.add_argument("-p", "--pretty", "--prettyprint", "--pretty-print", action="store_true", help="Pretty-print json output")
    # Two separate option strings; a single "-V|--version" matches neither flag.
    parser.add_argument("-V", "--version", action="version", version=coupons_version)
    args = parser.parse_args(argv)

    if args.stdin:
        raw = sys.stdin.read()
        if args.clean:
            raw = clean(raw)
        result = parse_coupons(raw, args.search)
    else:
        if args.store is None:
            parser.error("--store is required unless --stdin is used")
        result = fetch_and_search(args.store, args.nocache, args.date, args.search, config=None)

    if args.pretty:
        print(json.dumps(json.loads(result), indent=3))
    else:
        print(result)


if "__main__" == __name__:
    main()