From b13c9d59c64df1e06c5890895a44d3c3a538178e Mon Sep 17 00:00:00 2001
From: "B. Stack" <bgstack15@gmail.com>
Date: Thu, 1 Sep 2022 14:05:50 -0400
Subject: initial commit

---
 .gitignore  |   2 +
 README.md   |  35 +++++++++++++
 aux/ads1.py |  28 ++++++++++
 aux/notes   |  21 ++++++++
 coupons.py  | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 258 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 README.md
 create mode 100755 aux/ads1.py
 create mode 100644 aux/notes
 create mode 100755 coupons.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b6b17d1
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+old
+__pycache__
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..36f4f99
--- /dev/null
+++ b/README.md
@@ -0,0 +1,35 @@
+# README for coupons
+This project exists to make it easy to query the current sale papers for select grocery stores, including specifically Publix.
+
+## Upstream
+This project's upstream is at <https://bgstack15.ddns.net/cgit/coupons>.
+
+## Alternatives
+Visiting <https://southernsavers.com> manually, or each store's website.
+
+I previously started using selenium (see [aux/ads1.py](aux/ads1.py)) but that was not necessary.
+
+## Reason for existence
+To simplify and automate searching for items currently on sale.
+
+## Using
+Basic usage is pretty simple. You need to specify a store at a minimum. When the script visits the webpage, it will cache the response to `$XDG_CACHE_DIR/coupons/` (or `~/.cache/coupons/` when that variable is unset) to reduce the amount of work needed during the same day.
+
+    ./coupons.py --store "publix" --search "candy"
+
+To use standard input from a fully-cached javascript response from the website, you could run this.
+
+    <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty
+
+An already-cleaned json file would not need the **--clean** flag. But the cached values are the exact javascript+json payload from the server.
+
+See also `./coupons.py --help`.
+
+## Dependencies
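+Python 3 with the `requests` module. The abandoned selenium attempt in [aux/ads1.py](aux/ads1.py) additionally imports `selenium`, `pyvirtualdisplay`, and `bs4` (BeautifulSoup).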
+
+## Building or changing
+Only two stores are currently supported. The southernsavers.com website lists other stores that are probably drop-in capable. To learn the widget.json path needed, use Developer Tools in a web browser to capture the full widget.json url and add it to the **store_urls** dict.
+
+## References
+Developer Tools in Firefox
diff --git a/aux/ads1.py b/aux/ads1.py
new file mode 100755
index 0000000..9a35dd3
--- /dev/null
+++ b/aux/ads1.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+from pyvirtualdisplay import Display
+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.keys import Keys
+from bs4 import BeautifulSoup
+import time, json, configparser, sys, os, argparse, textwrap
+from json import JSONEncoder
+from sys import argv
+
+display = Display(visible=0, size=(1024,768)) # presumably a hidden virtual display for the browser to draw on; created at import time
+display.start() # NOTE(review): nothing in this file stops the display again
+
+def find_string(instring):
+   """ Load the Publix weekly ad page in Firefox and return the page source.
+   The instring parameter is accepted but is not used by this function. """
+   url = "https://www.southernsavers.com/publix-weekly-ad-deals/"
+   with webdriver.Firefox() as browser:
+      browser.get(url)
+      # fixed 15 second pause before reading the page, for good measure
+      time.sleep(15)
+      page = browser.page_source
+   return page
+
+if __name__ == "__main__":
+   print(find_string(argv[1] if len(argv) > 1 else "")) # find_string is the only function in this file; the loop1 called before is not defined anywhere
diff --git a/aux/notes b/aux/notes
new file mode 100644
index 0000000..4a89b79
--- /dev/null
+++ b/aux/notes
@@ -0,0 +1,21 @@
+# startdate: 2022-08-31 21:49
+# Extracted from visiting https://www.southernsavers.com/publix-weekly-ad-deals/# in firefox
+# Ref:
+#    https://serverfault.com/questions/991982/jq-get-values-from-children-array-and-display-on-parent-array/991996#991996
+curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' > ~/foo34
+LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' ~/foo34 > ~/foo35
+<~/foo35 jq
+cl ; <~/foo35 jq '. as $input | $input[].categories[] | {name,items}'
+# find items where "Cracker" shows up in text of "html" tag.
+cl ; <~/foo35 jq '.[].categories[].items[] | select( .html | strings | test("Cracker")?)' 
+# all things in an easier format but not yet limited to "Cracker" search
+cl ; <~/foo35 jq '.[].categories[] as $cat | $cat | [del(.items,.id), (.items[] | { deal: .html }) ] | add'
+# does not do what i want
+cl ; <~/foo35 jq '.[] | [del(.id,.kind,.categories), (.categories[]|{ category: .name}), (.categories[].items[]|{html: .html}) ] | add'
+# instead of all this crazy jq above, use python to process and search
+<~/foo35 ./coupons.py 'zevia' | jq
+
+# all together:
+curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
+
+ingles url is https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345
diff --git a/coupons.py b/coupons.py
new file mode 100755
index 0000000..061f998
--- /dev/null
+++ b/coupons.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python3
+# File: coupons.py
+# Location: .
+# Author: bgstack15
+# Startdate: 2022-08-31
+# Title: Json Parser of Publix sales
+# Project: coupons
+# Purpose: Parse json for coupons that match lowercase string
+# Usage:
+#    called from check-sales.sh
+#    Search with a lower-case string, against the lowercase values of coupon titles.
+#       <input.json ./coupons.py --stdin --search 'candy'
+# History:
+#    I attempted to write similar logic with jq, but python is way easier
+# Reference:
+#    [internal] rod2/rod.py
+# Improve:
+# Documentation: README.md
+import sys, json, requests, os, datetime, re, textwrap
+
+store_urls = { # store name -> widget.json url; the names double as the --store choices
+   "publix": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106",
+   "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345"
+}
+coupons_version = "2022-09-01a" # printed by the version flag of the command line
+
+def fetch(store, force = False, date = None):
+   """
+   Given a store name, return the raw (not yet cleaned) widget.json response text.
+   The cached response for date (default today) is used unless force is set or
+   nothing is cached; otherwise the url is visited and the response is cached.
+   Returns -1 after printing an error when store is not a key of store_urls.
+   """
+   # Reference:
+   # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
+   key = store.lower() if isinstance(store, str) else None # match names case-insensitively, like the cache file names
+   if key not in store_urls:
+      print(f"ERROR (fetch): store {store} not a valid option.",file=sys.stderr)
+      return -1
+   # try to use cache at first
+   contents = None if force else get_cached_contents(key, date) # it is safe to return None
+   # So if force == True, or the cache failed
+   if not contents:
+      print(f"INFO (fetch): no cached content, so visiting url",file=sys.stderr)
+      r = requests.get(store_urls[key], timeout = 60) # do not hang forever on a dead server
+      contents = r.text
+      # try to save to cache, but it is not a blocker; an http error page is never cached
+      if r.ok:
+         try:
+            set_cached_contents(key, date, contents)
+         except Exception as e:
+            print(f"WARNING (fetch): unable to cache response: {e}",file=sys.stderr)
+   return contents
+
+def clean(contents):
+   """ Clean the javascript from southernsavers.com widget.json response and return only the json list text. """
+   # Reference:
+   # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
+   a = re.sub(r"^.*lists: \[","[",contents) # drop everything before the list opens
+   a = re.sub(r"\],\\n.*$","]",a) # drop everything after the list closes
+   a = re.sub(r"\\\\[uU]003[eE]",">",a) # raw strings keep these patterns free of invalid string escapes
+   a = re.sub(r"\\\\[uU]003[cC]","<",a)
+   a = re.sub(r'\\\\"',r'\\"',a) # quotes lose their escaping in two passes, same order as the sed reference
+   return re.sub(r'\\"','"',a)
+
+def get_cached_name(store, date = None):
+   """
+   Given store name, return cache filename regardless of existence or contents.
+   The name is <cache root>/coupons/<store>_<date>.json with store lowercased
+   and date a YYYY-MM-DD string that defaults to today.
+   The cache root is $XDG_CACHE_DIR (the name this project has always read), else
+   $XDG_CACHE_HOME (the XDG base directory name), else ~/.cache.
+   The directory is created when possible; failure is ignored here so callers
+   simply see a missing file on read or an error on write.
+   """
+   store = store.lower()
+   cache_root = os.environ.get("XDG_CACHE_DIR") or os.environ.get("XDG_CACHE_HOME")
+   if not cache_root:
+      cache_root = os.path.join(os.path.expanduser("~"),".cache") # works even when HOME is unset
+   cache_dir = os.path.join(cache_root, "coupons") # use an app-specific dir underneath it
+   try:
+      os.makedirs(cache_dir, exist_ok = True) # also creates a missing cache root
+   except OSError:
+      pass # caching is not available; but this should not stop the program
+   if date is None:
+      date = datetime.date.today().isoformat() # same YYYY-MM-DD as "%F", which not every platform supports
+   return os.path.join(cache_dir,"_".join([store,date]) + ".json")
+
+def get_cached_contents(store, date = None):
+   """ Given store name, return the cached text, or None when no readable cache exists.
+   Also, use today's date if not given a specific one.
+   """
+   cache_file = get_cached_name(store, date)
+   if os.path.exists(cache_file):
+      try:
+         print(f"INFO(get_cached_contents): using cache {cache_file}",file=sys.stderr)
+         with open(cache_file,"r") as f: # the with block closes the handle again
+            return f.read()
+      except (OSError, ValueError): # an unreadable or undecodable file counts as a cache miss
+         print(f"INFO(get_cached_contents): unable to open existing cache file {cache_file}",file=sys.stderr)
+   return None
+
+def set_cached_contents(store, date = None, contents = None):
+   """ Write contents to the cache file for store and date (default today).
+   Returns True without writing when contents is None or empty, else None.
+   Errors from opening or writing the file are raised to the caller. """
+   if contents is None or "" == contents:
+      return True # cache nothing so short-circuit
+   with open(get_cached_name(store, date),"w") as f: # get_cached_name lowercases store and defaults the date
+      f.write(contents)
+
+def parse_coupons(inputobject, searchstring = None):
+   """
+   Main logic to simplify the json down as well as return only results that match searchstring.
+   inputobject is the cleaned json text: a list of groups that each have a "name" and
+   "categories"; each category has a "name" and "items"; each item has "html".
+   searchstring is matched case-insensitively as a substring of each item's html;
+   None or "" matches every item.
+   Returns a json string shaped {group name: {category name: [html, ...]}} that holds
+   only the groups and categories with at least one matching item.
+   Raises ValueError from json.loads when inputobject is not valid json, and KeyError
+   when one of those keys is missing.
+   """
+   # lowercase the needle too, so that mixed-case input like --search "Candy" can match
+   needle = (searchstring or "").lower()
+   response = {}
+   for group in json.loads(inputobject):
+      rgroup = group["name"]
+      for c in group["categories"]:
+         rcat = c["name"]
+         for i in c["items"]:
+            text = i["html"]
+            if needle in text.lower():
+               # only make this group and category if we have a match
+               response.setdefault(rgroup, {}).setdefault(rcat, []).append(text)
+   return json.dumps(response)
+
+def fetch_and_search(store, force = False, date = None, searchstring = None):
+   """ Main usage of the whole library: fetch, clean and search. Returns a json string, which is "{}" when fetch reported an error. """
+   a = fetch(store, force, date) # fetch prints its own error and returns -1 for an unknown store
+   return parse_coupons(clean(a), searchstring) if isinstance(a, str) else json.dumps({})
+
+if "__main__" == __name__:
+   import argparse
+   parser = argparse.ArgumentParser(prog = sys.argv[0], description = "Search currently listed sales/coupons on SouthernSavers.com", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent("""To use the cache file as standard input, run:
+   <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty
+
+Basic usage:
+   ./coupons.py --store "publix" --search "candy"
+"""))
+   parser.add_argument("-n","--nocache","--nc","--no-cache", action = "store_true", help = "Skip the cache and always visit site.")
+   parser.add_argument("-d", "--date", help = "Use different YYYY-MM-DD than today, for cache purposes. Does not affect visiting the website")
+   parser.add_argument("-s","--search", help = "Search for items that match this, when converted to lowercase. Can leave blank to display all items")
+   parser.add_argument("--store", help = "Select sales from this store.", choices = [f for f in store_urls])
+   parser.add_argument("--stdin", action = "store_true", help = "Pipe stdin to parse_coupons. Can still use --search")
+   parser.add_argument("--clean", action = "store_true", help = "If using --stdin, also clean the whole javascript input into just the useful json part.")
+   parser.add_argument("-p","--pretty","--prettyprint","--pretty-print", action = "store_true", help = "Pretty-print json output")
+   parser.add_argument("-V","--version", action = "version", version = coupons_version) # two separate flags; one "-V|--version" string never accepted --version
+   args = parser.parse_args()
+   # without --stdin there is nothing to fetch unless a store was chosen
+   if not args.stdin and args.store is None:
+      parser.error("--store is required unless --stdin is used")
+   if args.stdin:
+      raw = sys.stdin.read()
+      # --clean strips the javascript wrapper first; already-clean json is parsed as it is
+      a = parse_coupons(clean(raw) if args.clean else raw,args.search)
+   else:
+      a = fetch_and_search(args.store,args.nocache,args.date,args.search)
+   if args.pretty:
+      print(json.dumps(json.loads(a),indent=3))
+   else:
+      print(a)
-- 
cgit