#!/usr/bin/env python3
# Startdate: 2020-05-30 19:30
# Purpose: remove key, useless html elements from slurped pages
from bs4 import BeautifulSoup
import sys

# CSS classes for which only the FIRST matching element is removed.
# Order is preserved from the original script.
_FIRST_MATCH_CLASSES = (
    "nav-sidebar",
    "navbar-gitlab",
    "issuable-context-form",
    "js-issuable-sidebar",
    "js-issuable-actions",
    "js-noteable-awards",
    "disabled-comment",
    "notes-form",
    "btn-edit",
    "js-issuable-edit",
)

# CSS classes for which EVERY matching element is removed.
_ALL_MATCH_CLASSES = (
    "note-actions",
    "emoji-block",
    "broadcast-message",
)


def remove_useless(contents):
    """Parse *contents* as HTML and strip unwanted elements from it.

    For each class in ``_FIRST_MATCH_CLASSES`` the first element carrying
    that class (if any) is replaced with an empty string; for each class in
    ``_ALL_MATCH_CLASSES`` every element carrying it is replaced.

    Args:
        contents: HTML markup to clean (anything ``BeautifulSoup`` accepts).

    Returns:
        The modified ``BeautifulSoup`` document.
    """
    soup = BeautifulSoup(contents, "html.parser")

    for css_class in _FIRST_MATCH_CLASSES:
        element = soup.find(class_=css_class)
        # find() returns None when the class is absent from the page; that
        # is the normal case the old bare "except: pass" was papering over.
        if element is not None:
            element.replace_with("")

    for css_class in _ALL_MATCH_CLASSES:
        # find_all() returns an empty result when nothing matches, so no
        # guard is needed here.
        for element in soup.find_all(class_=css_class):
            element.replace_with("")

    return soup


# this works, for the single file called
#with open(sys.argv[1],"r") as infile:
#   lines = infile.read()

# One path per line; trailing whitespace (including the newline) is dropped.
with open("output/files-for-timestamps.txt") as f:
    lines = [line.rstrip() for line in f]

# Each listed file is cleaned and rewritten in place.
for thisfile in lines:
    print("Removing useless html in file",thisfile)
    # Read with the same encoding used for the write below, so the round
    # trip does not depend on the platform's locale default.
    with open(thisfile, encoding='utf-8') as tf:
        output = remove_useless(tf.read())
    with open(thisfile, "w", encoding='utf-8') as tf:
        tf.write(str(output.prettify()))