coupons.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256

#!/usr/bin/env python3
# File: coupons.py
# Location: .
# Author: bgstack15
# Startdate: 2022-08-31
# Title: Json Parser of Publix sales
# Project: coupons
# Purpose: Parse json for coupons that match lowercase string
# Usage:
#    called from check-sales.sh
#    Search with a lower-case string, against the lowercase values of coupon titles.
#       <input.json ./coupons.py --stdin --search 'candy'
# History:
#    I attempted to write similar logic with jq, but python is way easier
# Reference:
#    [internal] rod2/rod.py
# Improve:
# Documentation: README.md
import sys, json, requests, os, datetime, re, textwrap

# Ripped from https://stackoverflow.com/questions/26105659/how-to-convert-the-unicode-to-latin-characters-python/61551939#61551939
from typing import Optional
import html, unicodedata

class Config:
   def __init__(self, cache_dir = None):
      if "" == cache_dir or cache_dir is None:
         try:
            cache_dir = os.environ.get("XDG_CACHE_HOME") # defaults to ~/.cache
         except:
            pass
      if "" == cache_dir or cache_dir is None:
         try:
            cache_dir = os.path.join(os.environ.get("HOME"),".cache")
         except:
            cache_dir = "/tmp"
      #print(f"DEBUG(Config.init): cache_dir {cache_dir}")
      self.cache_dir = cache_dir

def normalize(value: str, encoding: Optional[str] = None) -> str:
    """
    Normalize characters not maintainable when encode.
    The default encoding is "ascii".
    """
    if encoding is None:
        return unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = value.encode(encoding, 'backslashreplace').decode(encoding)
    value = value.encode('ascii', 'xmlcharrefreplace').decode('unicode-escape')
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    return html.unescape(value)

store_urls = {
   "publix": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106",
   "ingles": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000f/widget.json?callback=jQuery11110011370202243518035_1662043118344&_=1662043118345",
   "food lion": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce000000a/widget.json?callback=jQuery111104817919592912373_1662672814198&_=1662672814199",
   "lidl": "https://clipmunk.southernsavers.com/stores/59405bea724edc4175003366/widget.json?callback=jQuery111104720958887493587_1662672848590&_=1662672848591",
   "lowes foods": "https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000011/widget.json?callback=jQuery11110619146055949265_1668430717483&_=1668430717484"
}
coupons_version = "2022-11-14a"

def fetch(store, force = False, date = None, config = None):
   """ Given a store name, visit the url and clean the json. If force, then update cached response."""
   # Reference:
   # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
   if store is None or store not in [f for f in store_urls]:
      print(f"ERROR (fetch): store {store} not a valid option.",file=sys.stderr)
      return -1
   # validate date
   today = datetime.datetime.today().date()
   if date is None:
      date = today.strftime("%F")
   try:
      if date is not None:
         ddate = datetime.datetime.strptime(date,"%Y-%m-%d").date()
         if ddate > today:
            ddate = today
            print(f"WARNING(fetch): date {date} is in the future. Using {today} instead.")
   except:
      ddate = today
      print(f"WARNING(fetch): date {date} is invalid YYYY-MM-DD. Using {today} instead.")
   try:
      date = ddate.strftime("%F")
   except:
      pass
   if date is not None and date != today.strftime("%F"):
      print(f"DEBUG(fetch): using date {date}")
   # try to use cache at first
   contents = None
   if not force:
      contents = None
      contents = get_cached_contents(store, date, config) # it is safe to return None
   # So if force == True, or the cache failed
   if contents is None or "" == contents:
      print(f"INFO (fetch): no cached content for {store},{date}, so visiting url",file=sys.stderr)
      try:
         url = store_urls[store.lower()]
      except:
         print(f"ERROR (fetch): no url saved for store {store}",file=sys.stderr)
      r = requests.get(url)
      contents = r.text
      # try to save to cache, but it is not a blocker
      #try:
      if True:
         set_cached_contents(store, date, contents, config)
      #except:
      #   pass
   return contents

def clean(contents):
   """ Clean the javascript from southernsavers.com widget.json response. """
   # Reference:
   # curl 'https://clipmunk.southernsavers.com/stores/4f823db2090c841ce0000013/widget.json?callback=jQuery111106644488051860198_1661993569105&_=1661993569106' | LANG=C sed -r -e 's/\\\\[uU]003[eE]/>/g;' -e 's/\\\\[uU]003[cC]/</g;' -e 's/^.*lists: \[/\[/;' -e 's/\],\\n.*$/\]/;' -e 's/\\\\"/\\"/g;' -e 's/\\"/"/g;' | ./coupons.py 'candy' | jq
   a = re.sub("^.*lists: \\[","[",contents)
   a = re.sub("\],\\\\n.*$","]",a)
   a = re.sub("\\\\\\\\[uU]003[eE]",">",a)
   a = re.sub("\\\\\\\\[uU]003[cC]","<",a)
   a = re.sub("\\\\\\\\[uU]0026","&",a)
   a = re.sub("\\\\\\\\[uU]201[cCdDeEfF]",'"',a)
   a = re.sub("\\\\\\\\[uU]201[89aA]","'",a)
   contents = re.sub('\\\\"','"',re.sub('\\\\\\\\"','\\\\"',a))
   return contents

def get_cached_name(store, date = None, config = None):
   """
   Given store name, return cache filename regardless of existence or contents.
   """
   USE_CACHE = True
   store = store.lower()
   if config is None:
      #print(f"DEBUG(get_cached_name): must generate new config")
      config = Config()
   cache_dir = config.cache_dir
   # use an app-specific dir underneath it
   cache_dir = os.path.join(cache_dir, "coupons")
   if not os.path.isdir(cache_dir):
      try:
         os.mkdir(cache_dir)
      except:
         # caching is not available; but this should not stop the program
         print(f"INFO(get_cached_name): cannot create cache directory {cache_dir}.")
         USE_CACHE = False
   if USE_CACHE:
      if date is None:
         date = datetime.datetime.today().strftime("%F")
      cache_file = os.path.join(cache_dir,"_".join([store,date]) + ".json")
      #print(f"DEBUG(get_cached_name): generated path is {cache_file}")
      return cache_file
   print(f"DEBUG(get_cached_name): no cache filename generated.")
   return None

def get_cached_contents(store, date = None, config = None):
   """
   Given store name, get cached contents
   Also, use today's date if not given a specific one.
   """
   cache_file = get_cached_name(store, date, config)
   if cache_file is not None and os.path.exists(cache_file):
      try:
         print(f"INFO(get_cached_contents): using cache {cache_file}",file=sys.stderr)
         return open(cache_file,"r").read()
      except:
         print(f"INFO(get_cached_contents): unable to open existing cache file {cache_file}",file=sys.stderr)
   return None

def set_cached_contents(store, date = None, contents = None, config = None):
   """
   Write the large js+json payload to a cache file, if possible. This is low-priority.
   """
   if contents is None or "" == contents:
      return True # cache nothing so short-circuit
   #print(f"DEBUG(set_cached_contents): contents length {len(contents)}")
   if date is None:
      date = datetime.datetime.today().strftime("%F")
   #print(f"DEBUG(set_cached_contents): using date {date}")
   store = store.lower()
   cache_file = get_cached_name(store, date, config)
   if cache_file is not None:
      print(f"DEBUG(set_cached_contents): saving cache {cache_file} size {len(contents)}")
      with open(cache_file,"w") as w:
         # flatten weird characters into normal ones
         #w.write(contents.encode('utf-8').decode('latin-1'))
         w.write(normalize(contents))

def parse_coupons(inputobject, searchstring = None):
   """
   Main logic to simplify the json down as well as return only results that match searchstring which should be lowercase.
   """
   a = inputobject
   b = json.loads(a)
   if searchstring is None:
      searchstring = ""
   response = {}
   for group in b:
      rgroup = group["name"]
      #response[rgroup] = {}
      for c in group["categories"]:
         rcat = c["name"]
         #response[rgroup][rcat] = []
         for i in c["items"]:
            print(f"item = {i}")
            try:
               text = i["html"]
            except:
               text = i["name"]
               add_text = ""
               for h in i["items"] if "items" in i else []:
                  add_text += "<li>" + str(h["html"] if "html" in h else "") + str(h["notes"] if "notes" in h else "") + "</li>"
               if "" != add_text:
                  text += "<ul>" + add_text + "</ul>"
            if "notes" in i and i["notes"] != "":
               text = text + " <small>" + i["notes"] + "</small>"
            #if searchstring in text.lower():
            if re.match(".*(" + searchstring.lower() + ").*", text.lower()):
               # only make this group and category if we have a match
               if rgroup not in response:
                  response[rgroup] = {}
               if rcat not in response[rgroup]:
                  response[rgroup][rcat] = []
               response[rgroup][rcat].append(text)
   return(json.dumps(response))

def fetch_and_search(store, force = False, date = None, searchstring = None, config = None):
   """ Main usage of the whole library for cli. """
   a = clean(fetch(store, force, date, config))
   return parse_coupons(a, searchstring)

if "__main__" == __name__:
   import argparse
   parser = argparse.ArgumentParser(prog = sys.argv[0], description = "Search currently listed sales/coupons on SouthernSavers.com", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent("""To use the cache file as standard input, run:
   <~/.cache/coupons/publix_2022-09-01.json ./coupons.py --stdin --clean --search "candy" --pretty

Basic usage:
   ./coupons.py --store "publix" --search "candy"
"""))
   parser.add_argument("-n","--nocache","--nc","--no-cache", action = "store_true", help = "Skip the cache and always visit site.")
   parser.add_argument("-d", "--date", help = "Use different YYYY-MM-DD than today, for cache purposes. Does not affect visiting the website")
   parser.add_argument("-s","--search", help = "Search for items that match this, when converted to lowercase. Can leave blank to display all items")
   parser.add_argument("--store", help = f"Select sales from this store.", choices = [f for f in store_urls])
   parser.add_argument("--stdin", action = "store_true", help = "Pipe stdin to parse_coupons. Can still use --search")
   parser.add_argument("--clean", action = "store_true", help = "If using --stdin, also clean the whole javascript input into just the useful json part.")
   parser.add_argument("-p","--pretty","--prettyprint","--pretty-print", action = "store_true", help = "Pretty-print json output")
   parser.add_argument("-V|--version", action = "version", version = coupons_version)
   args = parser.parse_args()
   #print(args,file=sys.stderr)
   a = None
   if args.stdin:
      if args.clean:
         a = parse_coupons(clean(sys.stdin.read()),args.search)
      else:
         a = parse_coupons(sys.stdin.read(),args.search)
   else:
      a = fetch_and_search(args.store,args.nocache,args.date,args.search,config = None)
   if args.pretty:
      print(json.dumps(json.loads(a),indent=3))
   else:
      print(a)