summaryrefslogtreecommitdiff
path: root/fix-timestamps.py
blob: a5a44414931349b21f9f85f2b716415b3e7e5fb8 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
#!/usr/bin/env python3
# Startdate: 2020-05-29 20:40
# Purpose: convert timestamps on gitlab issue web page into UTC
# History:
#    2020-05-30 09:24 add loop through files listed in output/files-for-timestamps.txt
# Usage:
#    ls -1 /mnt/public/www/gitlab-issues/*.html > output/files-for-timestamps.txt
#    ./fix-timestamps.py
# References:
#    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#pretty-printing
#    https://gitlab.com/bgstack15/vooblystats/-/blob/master/vooblystats.py
#    https://bgstack15.wordpress.com/2020/02/16/python3-convert-relative-date-to-utc-timestamp/
# Improve:
#    this is hardcoded to work when the pages are shown in EDT.
from bs4 import BeautifulSoup
from datetime import timedelta
from parsedatetime import Calendar
from pytz import timezone 

def fix_timestamps(page_text):
   """Rewrite the visible text of every <time> element as a UTC timestamp.

   For each <time> tag, the "data-original-title" attribute is parsed as a
   date/time, converted to UTC, and written back as the tag's text in the
   form YYYY-MM-DDTHH:MMZ.

   Parameters:
      page_text: HTML markup of the page, handed to BeautifulSoup.

   Returns:
      The BeautifulSoup object for the page, with the <time> tags modified.

   Notes:
      Titles ending in "EDT" or "EST" are interpreted as US/Eastern wall-clock
      time; every other title is interpreted as already being in UTC.
      <time> tags without a "data-original-title" attribute, or whose title
      cannot be parsed, are left untouched.
   """
   soup = BeautifulSoup(page_text,"html.parser")
   cal = Calendar()
   # Build the timezone objects once instead of once per tag.
   eastern = timezone("US/Eastern")
   utc = timezone("UTC")
   for tag in soup.find_all(name='time'):
      title = tag.attrs.get("data-original-title")
      if not title:
         # Nothing to convert; a plain ["..."] lookup would raise KeyError here.
         continue
      # Pick the zone the title was rendered in, and actually use it for
      # parsing (previously the choice was computed but US/Eastern was always
      # passed to parseDT).
      source_tz = eastern if title[-3:] in ('EDT', 'EST') else utc
      dto, status = cal.parseDT(datetimeString=title,tzinfo=source_tz)
      if not status:
         # A zero status means no date/time was recognised; do not overwrite
         # the tag with whatever fallback datetime the parser returned.
         continue
      # astimezone performs the real offset conversion, replacing the earlier
      # string-slicing of the "+HH:MM" suffix that only handled whole hours.
      # second precision %S is not needed for this use case.
      tag.string = dto.astimezone(utc).strftime('%Y-%m-%dT%H:%MZ')
   return soup

# Load the list of HTML files to process, one path per line.
# Blank lines are skipped so they are not passed to open() as an empty path.
with open("output/files-for-timestamps.txt") as f:
   lines = [line.rstrip() for line in f if line.strip()]

# Rewrite each listed file in place with its timestamps converted.
for thisfile in lines:
   print("Fixing timestamps in file",thisfile)
   # Read with the same encoding used for writing below, so the round trip
   # does not depend on the locale's default encoding.
   with open(thisfile,encoding='utf-8') as tf:
      output=fix_timestamps(tf.read())
   with open(thisfile,"w",encoding='utf-8') as tf:
      # prettify() already returns a str when no encoding is given.
      tf.write(output.prettify())
bgstack15