author    | B. Stack <bgstack15@gmail.com> | 2022-09-01 14:05:50 -0400
committer | B. Stack <bgstack15@gmail.com> | 2022-09-01 14:05:50 -0400
commit    | b13c9d59c64df1e06c5890895a44d3c3a538178e
tree      | 9bc9a44bfe9eda7396a91fbdfe326f1d4777caf8
initial commit
-rw-r--r-- | .gitignore  |   2
-rw-r--r-- | README.md   |  35
-rwxr-xr-x | aux/ads1.py |  28
-rw-r--r-- | aux/notes   |  21
-rwxr-xr-x | coupons.py  | 172
5 files changed, 258 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b6b17d1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+old
+__pycache__
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..36f4f99
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# README for coupons
+This project makes it easy to query the current sale papers for select grocery stores, notably Publix.
+
+## Upstream
+This project's upstream is at <https://bgstack15.ddns.net/cgit/coupons>.
+
+## Alternatives
+Visiting <https://southernsavers.com> manually, or each store's website.
+
+I previously started with selenium (see [aux/ads1.py](aux/ads1.py)), but that turned out to be unnecessary.
+
+## Reason for existence
+To simplify and automate searching for items currently on sale.
+
+## Using
+Basic usage is simple; at a minimum you must specify a store. When the script visits the webpage, it caches the response to `$XDG_CACHE_DIR/coupons/` so repeated runs on the same day skip the network fetch.
+
+    ./coupons.py --store "publix" --search "candy"
+
+To use standard input from a fully-cached javascript response from the website, you could run this:
+
+    <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty
+
+An already-cleaned json file would not need the **--clean** flag, but the cached values are the exact javascript+json payload from the server.
+
+See also `./coupons.py --help`.
+
+## Dependencies
+Python 3 with the `requests` module; everything else `coupons.py` uses is in the standard library. The abandoned selenium experiment in `aux/ads1.py` additionally needs `selenium` and `pyvirtualdisplay`.
+
+## Building or changing
+Only two stores are currently supported, but the southernsavers.com website lists other stores that are probably drop-in capable. To learn the widget.json path needed, use Developer Tools in a web browser to capture the full widget.json URL and add it to the **store_urls** dict, as sketched below.
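+
+A hypothetical new entry would look like the following (the store id and callback values are placeholders, not a real store's):
+
+    store_urls = {
+        # ... existing entries ...
+        "new-store": "https://clipmunk.southernsavers.com/stores/<store-id>/widget.json?callback=<callback>&_=<timestamp>",
+    }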
+
+## References
+Developer Tools in Firefox
diff --git a/aux/ads1.py b/aux/ads1.py
new file mode 100755
index 0000000..9a35dd3
--- /dev/null
+++ b/aux/ads1.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+# Abandoned selenium experiment: render the deals page in a virtual-display
+# Firefox and return the page source. Kept for reference only; see README.
+from pyvirtualdisplay import Display
+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+import time
+
+display = Display(visible=0, size=(1024,768))
+display.start()
+
+def find_string(instring):
+    # instring is currently unused; this just fetches the rendered page.
+    a = ""
+    with webdriver.Firefox() as browser:
+        browser.get("https://www.southernsavers.com/publix-weekly-ad-deals/")
+        #wait = WebDriverWait(browser, 8)
+        #wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "selectAllLink"))).click()
+        time.sleep(15) # wait 15 seconds for the dynamic content to load
+        a = browser.page_source
+        return a
+
+if __name__ == "__main__":
+    print(find_string(""))
diff --git a/aux/notes b/aux/notes
new file mode 100644
index 0000000..4a89b79
--- /dev/null
+++ b/aux/notes
@@ -0,0 +1,21 @@
+# startdate: 2022-08-31 21:49
+# Extracted from visiting https://www.southernsavers.com/publix-weekly-ad-deals/# in firefox
+# Ref:
+# https://serverfault.com/questions/991982/jq-get-values-from-children-array-and-display-on-parent-array/991996#991996
+curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' > ~/foo34
+LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' ~/foo34 > ~/foo35
+<~/foo35 jq
+cl ; <~/foo35 jq '. as $input | $input[].categories[] | {name,items}'
+# find items where "Cracker" shows up in text of "html" tag.
+cl ; <~/foo35 jq '.[].categories[].items[] | select( .html | strings | test("Cracker")?)'
+# all things in an easier format but not yet limited to "Cracker" search
+cl ; <~/foo35 jq '.[].categories[] as $cat | $cat | [del(.items,.id), (.items[] | { deal: .html }) ] | add'
+# does not do what i want
+cl ; <~/foo35 jq '.[] | [del(.id,.kind,.categories), (.categories[]|{ category: .name}), (.categories[].items[]|{html: .html}) ] | add'
+# instead of all this crazy jq above, use python to process and search
+<~/foo35 ./coupons.py 'zevia' | jq
+
+# all together:
+curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
+
+ingles url is https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345
diff --git a/coupons.py b/coupons.py
new file mode 100755
index 0000000..061f998
--- /dev/null
+++ b/coupons.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# File: coupons.py
+# Location: .
+# Author: bgstack15
+# Startdate: 2022-08-31
+# Title: Json Parser of Publix sales
+# Project: coupons
+# Purpose: Parse json for coupons that match lowercase string
+# Usage:
+#    called from check-sales.sh
+#    Search with a lower-case string, against the lowercase values of coupon titles.
+#    <input.json ./coupons.py --stdin --search 'candy'
+# History:
+#    I attempted to write similar logic with jq, but python is way easier
+# Reference:
+#    [internal] rod2/rod.py
+# Improve:
+# Documentation: README.md
+import sys, json, requests, os, datetime, re, textwrap
+
+store_urls = {
+    "publix": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106",
+    "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345"
+}
+coupons_version = "2022-09-01a"
+
+def fetch(store, force = False, date = None):
+    """ Given a store name, visit the url and clean the json. If force, then update cached response. """
+    # Reference: the curl | sed pipeline in aux/notes
+    if store is None or store.lower() not in store_urls:
+        print(f"ERROR (fetch): store {store} not a valid option.",file=sys.stderr)
+        return -1
+    store = store.lower()
+    # try the cache first
+    contents = None
+    if not force:
+        contents = get_cached_contents(store, date) # it is safe to return None
+    # So if force == True, or the cache failed
+    if contents is None or "" == contents:
+        print("INFO (fetch): no cached content, so visiting url",file=sys.stderr)
+        url = store_urls[store]
+        r = requests.get(url)
+        contents = r.text
+        # try to save to cache, but it is not a blocker
+        try:
+            set_cached_contents(store, date, contents)
+        except:
+            pass
+    return contents
+
+def clean(contents):
+    """ Clean the javascript from southernsavers.com widget.json response. """
+    # Same transformation as the sed pipeline in aux/notes: keep only the
+    # "lists" array, unescape angle brackets, and unescape quotes.
+    a = re.sub(r"^.*lists: \[", "[", contents)
+    a = re.sub(r"\],\\n.*$", "]", a)
+    a = re.sub(r"\\\\[uU]003[eE]", ">", a)
+    a = re.sub(r"\\\\[uU]003[cC]", "<", a)
+    contents = re.sub(r'\\"', '"', re.sub(r'\\\\"', r'\\"', a))
+    return contents
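+# A sketch of the transformation (abbreviated, invented payload; the real
+# response is one huge jQuery callback):
+#    >>> clean('jQuery123({ widget: "w", lists: [{"name": "Weekly Ad", "categories": []}],\\n tail: "t" });')
+#    '[{"name": "Weekly Ad", "categories": []}]'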
+ """ + USE_CACHE = True + store = store.lower() + cache_dir = os.environ.get("XDG_CACHE_DIR") # defauls to ~/.cache + if "" == cache_dir or cache_dir is None: + cache_dir = os.path.join(os.environ.get("HOME"),".cache") + # use an app-specific dir underneath it + cache_dir = os.path.join(cache_dir, "coupons") + if not os.path.isdir(cache_dir): + try: + os.mkdir(cache_dir) + except: + # caching is not available; but this should not stop the program + USE_CACHE = False + if USE_CACHE: + if date is None: + date = datetime.datetime.today().strftime("%F") + cache_file = os.path.join(cache_dir,"_".join([store,date]) + ".json") + return cache_file + +def get_cached_contents(store, date = None): + """ + Given store name, get cached contents + Also, use today's date if not given a specific one. + """ + cache_file = get_cached_name(store, date) + if os.path.exists(cache_file): + try: + print(f"INFO(get_cached_contents): using cache {cache_file}",file=sys.stderr) + return open(cache_file,"r").read() + except: + print(f"INFO(get_cached_contents): unable to open existing cache file {cache_file}",file=sys.stderr) + return None + +def set_cached_contents(store, date = None, contents = None): + if contents is None or "" == contents: + return True # cache nothing so short-circuit + if date is None: + date = datetime.datetime.today().strftime("%F") + store = store.lower() + cache_file = get_cached_name(store, date) + open(cache_file,"w").write(contents) + +def parse_coupons(inputobject, searchstring = None): + """ + Main logic to simplify the json down as well as return only results that match searchstring which should be lowercase. + """ + a = inputobject + b = json.loads(a) + if searchstring is None: + searchstring = "" + response = {} + for group in b: + rgroup = group["name"] + #response[rgroup] = {} + for c in group["categories"]: + rcat = c["name"] + #response[rgroup][rcat] = [] + for i in c["items"]: + text = i["html"] + if searchstring in text.lower(): + # only make this group and category if we have a match + if rgroup not in response: + response[rgroup] = {} + if rcat not in response[rgroup]: + response[rgroup][rcat] = [] + response[rgroup][rcat].append(text) + return(json.dumps(response)) + +def fetch_and_search(store, force = False, date = None, searchstring = None): + """ Main usage of the whole library. """ + a = clean(fetch(store, force, date)) + return parse_coupons(a, searchstring) + +if "__main__" == __name__: + import argparse + parser = argparse.ArgumentParser(prog = sys.argv[0], description = "Search currently listed sales/coupons on SouthernSavers.com", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent("""To use the cache file as standard input, run: + <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty + +Basic usage: + ./coupons.py --store "publix" --search "candy" +""")) + parser.add_argument("-n","--nocache","--nc","--no-cache", action = "store_true", help = "Skip the cache and always visit site.") + parser.add_argument("-d", "--date", help = "Use different YYYY-MM-DD than today, for cache purposes. Does not affect visiting the website") + parser.add_argument("-s","--search", help = "Search for items that match this, when converted to lowercase. Can leave blank to display all items") + parser.add_argument("--store", help = f"Select sales from this store.", choices = [f for f in store_urls]) + parser.add_argument("--stdin", action = "store_true", help = "Pipe stdin to parse_coupons. 
+
+def fetch_and_search(store, force = False, date = None, searchstring = None):
+    """ Main usage of the whole library. """
+    a = clean(fetch(store, force, date))
+    return parse_coupons(a, searchstring)
+
+if "__main__" == __name__:
+    import argparse
+    parser = argparse.ArgumentParser(prog = sys.argv[0], description = "Search currently listed sales/coupons on SouthernSavers.com", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent("""To use the cache file as standard input, run:
+    <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty
+
+Basic usage:
+    ./coupons.py --store "publix" --search "candy"
+"""))
+    parser.add_argument("-n","--nocache","--nc","--no-cache", action = "store_true", help = "Skip the cache and always visit the site.")
+    parser.add_argument("-d","--date", help = "Use a different YYYY-MM-DD than today, for cache purposes. Does not affect visiting the website.")
+    parser.add_argument("-s","--search", help = "Search for items that match this, when converted to lowercase. Leave blank to display all items.")
+    parser.add_argument("--store", help = "Select sales from this store.", choices = list(store_urls))
+    parser.add_argument("--stdin", action = "store_true", help = "Pipe stdin to parse_coupons. Can still use --search.")
+    parser.add_argument("--clean", action = "store_true", help = "If using --stdin, also clean the whole javascript input into just the useful json part.")
+    parser.add_argument("-p","--pretty","--prettyprint","--pretty-print", action = "store_true", help = "Pretty-print json output.")
+    parser.add_argument("-V","--version", action = "version", version = coupons_version)
+    args = parser.parse_args()
+    #print(args,file=sys.stderr)
+    a = None
+    if args.stdin:
+        if args.clean:
+            a = parse_coupons(clean(sys.stdin.read()),args.search)
+        else:
+            a = parse_coupons(sys.stdin.read(),args.search)
+    else:
+        a = fetch_and_search(args.store,args.nocache,args.date,args.search)
+    if args.pretty:
+        print(json.dumps(json.loads(a),indent=3))
+    else:
+        print(a)
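A hypothetical sketch of driving coupons.py as a library instead of a CLI, assuming it is importable from the repository root; the store name and search string below are only examples:

    #!/usr/bin/env python3
    import json
    from coupons import fetch_and_search

    # Fetch (or reuse today's cache of) the publix widget.json, clean it,
    # and keep only the items whose text contains "candy".
    results = json.loads(fetch_and_search("publix", searchstring="candy"))
    for group, categories in results.items():
        for category, items in categories.items():
            for item in items:
                print(group, "|", category, "|", item)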