coupons.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

#!/usr/bin/env python3
# File: coupons.py
# Location: .
# Author: bgstack15
# Startdate: 2022-08-31
# Title: Json Parser of Publix sales
# Project: coupons
# Purpose: Parse json for coupons that match lowercase string
# Usage:
#    called from check-sales.sh
#    Search with a lower-case string, against the lowercase values of coupon titles.
#       <input.json ./coupons.py --stdin --search 'candy'
# History:
#    I attempted to write similar logic with jq, but python is way easier
# Reference:
#    [internal] rod2/rod.py
# Improve:
# Documentation: README.md
import sys, json, requests, os, datetime, re, textwrap

# Map of supported store name (lowercase) to the southernsavers.com widget.json url
# that fetch() downloads. The keys are also offered as the --store choices on the
# command line.
# NOTE(review): the callback= and _= query parameters look like values captured
# from a browser jQuery JSONP request; confirm whether the site requires them.
store_urls = {
   "publix": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106",
   "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345"
}
# Version string reported by the command-line version option.
coupons_version = "2022-09-01a"

def fetch(store, force = False, date = None):
   """
   Return the raw widget.json response text for a store, preferring the local cache.

   store: name of a key in store_urls; matched case-insensitively.
   force: if True, skip the cache lookup and always visit the url. The fresh
      response is still written to the cache afterwards.
   date: YYYY-MM-DD string that only selects which cache file is used; when None
      the cache helpers pick today's date. Does not affect the url visited.

   Returns the response text, or -1 (after printing an error to stderr) when store
   is not a key of store_urls.
   Raises the requests exception for a network failure, timeout or HTTP error status.
   """
   # Reference:
   # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
   # Normalize once so that validation, url lookup and cache naming all agree.
   key = store.lower() if isinstance(store, str) else None
   url = store_urls.get(key)
   if url is None:
      print(f"ERROR (fetch): store {store} not a valid option.",file=sys.stderr)
      return -1
   # try to use cache at first
   contents = None
   if not force:
      contents = get_cached_contents(key, date) # it is safe to return None
   # So if force == True, or the cache failed
   if contents is None or "" == contents:
      print(f"INFO (fetch): no cached content, so visiting url",file=sys.stderr)
      # A timeout keeps an unresponsive server from hanging the program forever.
      r = requests.get(url, timeout = 30)
      # Fail here instead of caching an HTTP error page as today's data.
      r.raise_for_status()
      contents = r.text
      # try to save to cache, but it is not a blocker
      try:
         set_cached_contents(key, date, contents)
      except Exception as e:
         print(f"WARNING (fetch): unable to save cache for store {store}: {e}",file=sys.stderr)
   return contents

def clean(contents):
   """
   Clean the javascript from southernsavers.com widget.json response.

   contents: the response text as a str.
   Returns the str that remains after these substitutions, applied in order:
      1. everything from the start of the text through "lists: [" becomes "["
      2. everything from a literal "]," followed by a literal backslash-n to the
         end of the text becomes "]"
      3. a doubly-escaped \\u003e / \\u003c (either case) becomes ">" / "<"
      4. a quote preceded by two backslashes loses one backslash, and then every
         backslash-quote pair becomes a bare quote
   """
   # Reference:
   # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
   # Raw strings keep each pattern exactly as the regex engine sees it and avoid
   # the invalid "\]" string escape the non-raw literals relied on.
   a = re.sub(r"^.*lists: \[", "[", contents)
   a = re.sub(r"\],\\n.*$", "]", a)
   a = re.sub(r"\\\\[uU]003[eE]", ">", a)
   a = re.sub(r"\\\\[uU]003[cC]", "<", a)
   # Order matters: collapse \\" to \" first, then unescape every \" to ".
   a = re.sub(r'\\\\"', r'\\"', a)
   a = re.sub(r'\\"', '"', a)
   return a

def get_cached_name(store, date = None):
   """
   Given store name, return cache filename regardless of existence or contents.

   store: store name; lowercased for the filename.
   date: YYYY-MM-DD string for the filename; today's date when None.

   The file lives in a "coupons" directory under $XDG_CACHE_HOME, or under
   ~/.cache when that variable is unset or empty. The directory is created if
   possible; if it cannot be created the path is still returned, and callers
   find out when they try to read or write the file.
   """
   store = store.lower()
   cache_dir = os.environ.get("XDG_CACHE_HOME") # defaults to ~/.cache
   if not cache_dir:
      # expanduser still resolves a home directory when $HOME is unset
      cache_dir = os.path.join(os.path.expanduser("~"),".cache")
   # use an app-specific dir underneath it
   cache_dir = os.path.join(cache_dir, "coupons")
   if not os.path.isdir(cache_dir):
      try:
         # makedirs also creates a missing parent such as ~/.cache
         os.makedirs(cache_dir, exist_ok = True)
      except OSError:
         # caching is not available; but this should not stop the program
         pass
   if date is None:
      # explicit format, because %F is not supported by every platform's strftime
      date = datetime.datetime.today().strftime("%Y-%m-%d")
   return os.path.join(cache_dir,"_".join([store,date]) + ".json")

def get_cached_contents(store, date = None):
   """
   Given store name, get cached contents
   Also, use today's date if not given a specific one.

   Returns the text of the cache file, or None when the file does not exist or
   cannot be read. An unreadable existing file is reported on stderr.
   """
   cache_file = get_cached_name(store, date)
   try:
      # the with-block closes the file handle even if the read fails
      with open(cache_file,"r") as f:
         contents = f.read()
   except (FileNotFoundError, NotADirectoryError):
      # no cache yet; this is the normal first-run case so stay quiet
      return None
   except (OSError, ValueError):
      # ValueError covers a cache file that cannot be decoded as text
      print(f"INFO(get_cached_contents): unable to open existing cache file {cache_file}",file=sys.stderr)
      return None
   # only announce the cache once it has actually been read
   print(f"INFO(get_cached_contents): using cache {cache_file}",file=sys.stderr)
   return contents

def set_cached_contents(store, date = None, contents = None):
   """
   Write contents to the cache file for this store and date.

   store: store name; get_cached_name lowercases it.
   date: YYYY-MM-DD string; get_cached_name uses today's date when None.
   contents: text to store. None or "" means there is nothing to cache.

   Returns True, both when there was nothing to cache and after a successful write.
   Raises OSError if the cache file cannot be written; callers that treat the
   cache as optional should catch it.
   """
   if contents is None or "" == contents:
      return True # cache nothing so short-circuit
   cache_file = get_cached_name(store, date)
   # the with-block guarantees the data is flushed and the handle is closed
   with open(cache_file,"w") as f:
      f.write(contents)
   return True

def parse_coupons(inputobject, searchstring = None):
   """
   Main logic to simplify the json down as well as return only results that match searchstring.

   inputobject: json text holding a list of groups. Each group has a "name" and
      a list of "categories"; each category has a "name" and a list of "items";
      each item has an "html" string.
   searchstring: text to look for inside each item's html, compared
      case-insensitively. None or "" matches every item.

   Returns a json string shaped {group name: {category name: [html, ...]}}.
   Groups and categories without a matching item are left out.
   Raises json.JSONDecodeError for invalid json and KeyError for a missing key.
   """
   # Lowercase the needle too, so a search such as "Candy" is not silently empty.
   needle = (searchstring or "").lower()
   response = {}
   for group in json.loads(inputobject):
      rgroup = group["name"]
      for category in group["categories"]:
         rcat = category["name"]
         for item in category["items"]:
            text = item["html"]
            if needle in text.lower():
               # only make this group and category if we have a match
               response.setdefault(rgroup, {}).setdefault(rcat, []).append(text)
   return json.dumps(response)

def fetch_and_search(store, force = False, date = None, searchstring = None):
   """
   Main usage of the whole library.

   Fetches the store's response (store, force and date are passed to fetch),
   cleans it, and returns the json string produced by parse_coupons for
   searchstring.

   Raises ValueError when fetch does not return text, which is how it reports an
   unknown store.
   """
   contents = fetch(store, force, date)
   if not isinstance(contents, str):
      # fetch signals failure with a non-string value; stop with a clear error
      # here instead of letting clean() fail on it with an opaque TypeError
      raise ValueError(f"unable to fetch coupons for store {store!r}")
   return parse_coupons(clean(contents), searchstring)

if "__main__" == __name__:
   # Command-line entry point: read the response from stdin or fetch it for a
   # store, filter it by the search string, and print the resulting json.
   import argparse
   parser = argparse.ArgumentParser(prog = sys.argv[0], description = "Search currently listed sales/coupons on SouthernSavers.com", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent("""To use the cache file as standard input, run:
   <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty

Basic usage:
   ./coupons.py --store "publix" --search "candy"
"""))
   parser.add_argument("-n","--nocache","--nc","--no-cache", action = "store_true", help = "Skip the cache and always visit site.")
   parser.add_argument("-d", "--date", help = "Use different YYYY-MM-DD than today, for cache purposes. Does not affect visiting the website")
   parser.add_argument("-s","--search", help = "Search for items that match this, when converted to lowercase. Can leave blank to display all items")
   parser.add_argument("--store", help = "Select sales from this store.", choices = list(store_urls))
   parser.add_argument("--stdin", action = "store_true", help = "Pipe stdin to parse_coupons. Can still use --search")
   parser.add_argument("--clean", action = "store_true", help = "If using --stdin, also clean the whole javascript input into just the useful json part.")
   parser.add_argument("-p","--pretty","--prettyprint","--pretty-print", action = "store_true", help = "Pretty-print json output")
   # Two separate flags; the single string "-V|--version" registered one literal option.
   parser.add_argument("-V","--version", action = "version", version = coupons_version)
   args = parser.parse_args()
   # Without --stdin there is nothing to read unless a store was chosen.
   if not args.stdin and args.store is None:
      parser.error("--store is required unless --stdin is used")
   if args.stdin:
      raw = sys.stdin.read()
      if args.clean:
         raw = clean(raw)
      a = parse_coupons(raw,args.search)
   else:
      a = fetch_and_search(args.store,args.nocache,args.date,args.search)
   if args.pretty:
      # round-trip through json only to re-indent the output
      print(json.dumps(json.loads(a),indent=3))
   else:
      print(a)