diff options
Diffstat (limited to 'fetch-issue-webpages.py')
-rwxr-xr-x | fetch-issue-webpages.py | 78 |
1 files changed, 78 insertions, 0 deletions
#!/usr/bin/env python3
# Startdate: 2020-05-29 16:22
# History:
# Usage:
#    ln -s issues.all.web_url output/files-to-fetch.txt
#    ./fetch-issue-webpages.py
# How to make this work:
#    apt-get install python3-pyvirtualdisplay
#    download this geckodriver, place in /usr/local/bin
# References:
#    basic guide https://web.archive.org/web/20191031110759/http://scraping.pro/use-headless-firefox-scraping-linux/
#    https://stackoverflow.com/questions/40302006/no-such-file-or-directory-geckodriver-for-a-python-simple-selenium-applicatio
#    geckodriver https://github.com/mozilla/geckodriver/releases/download/v0.24.0/geckodriver-v0.24.0-linux64.tar.gz
#    https://www.selenium.dev/selenium/docs/api/py/index.html?highlight=get
#    page source https://www.selenium.dev/selenium/docs/api/py/webdriver_remote/selenium.webdriver.remote.webdriver.html?highlight=title#selenium.webdriver.remote.webdriver.WebDriver.title
#    make sure all comments load https://stackoverflow.com/questions/26566799/wait-until-page-is-loaded-with-selenium-webdriver-for-python/44998503#44998503
#    https://crossbrowsertesting.com/blog/test-automation/automate-login-with-selenium/
# Improve:
import getpass
import re
import time

from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait


def ask_password(prompt):
    """Ask for a password on the terminal and return what was typed.

    Args:
        prompt: Text shown to the user; ``": "`` is appended to it.

    Returns:
        The entered string. ``getpass`` is used so the input is not echoed.
    """
    return getpass.getpass(prompt + ": ")


def scrollDown(driver, value):
    """Scroll the browser window down by ``value`` pixels.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver here).
        value: Vertical distance handed to JavaScript ``window.scrollBy``.
    """
    driver.execute_script("window.scrollBy(0," + str(value) + ")")


def scrollDownAllTheWay(driver, step=500, steps_per_check=2, pause=2):
    """Keep scrolling until the page source stops changing.

    The page is scrolled ``steps_per_check`` times by ``step`` pixels, with a
    ``pause`` after every scroll, and ``driver.page_source`` is then compared
    with the previous snapshot. The loop ends on the first round in which the
    source is unchanged. The defaults reproduce the previously hard-coded
    values (500 px, 2 scrolls, 2 seconds).

    Args:
        driver: Object exposing ``page_source`` and ``execute_script``.
        step: Pixels to scroll per step.
        steps_per_check: Number of scroll steps between two source comparisons.
        pause: Seconds to sleep after each scroll step.

    Returns:
        Always ``True``.
    """
    old_page = driver.page_source
    while True:
        for _ in range(steps_per_check):
            scrollDown(driver, step)
            # NOTE(review): the pause is assumed to follow every single scroll
            # step (indentation was reconstructed) - confirm against upstream.
            time.sleep(pause)
        new_page = driver.page_source
        if new_page == old_page:
            return True
        old_page = new_page
server_string = "https://git.devuan.org"
outdir = "/mnt/public/www/issues"


def dest_filename(url, server=server_string):
    """Turn an issue URL into a flat ``.html`` file name.

    The leading ``server + "/"`` is removed, every ``/`` and every occurrence
    of the text ``issues`` is replaced by a dot, runs of dots are collapsed to
    a single dot and ``.html`` is appended.

    Args:
        url: Address of the page to save.
        server: Base URL stripped from the front of ``url``.

    Returns:
        The file name (no directory part).
    """
    relative = re.sub("^" + re.escape(server) + "/", "", url)
    dotted = re.sub(r"/|issues", ".", relative)
    return re.sub(r"\.+", ".", dotted) + ".html"


def main():
    """Log in to the GitLab instance and save every listed page as HTML.

    URLs are read from ``output/files-to-fetch.txt`` (one per line, blank
    lines ignored) and each rendered page is written to ``outdir``.
    """
    # Imported here so this block does not depend on the file's import header.
    # ``find_element(By..., ...)`` is accepted by old and current Selenium,
    # unlike the ``find_element_by_*`` helpers that newer releases removed.
    from selenium.webdriver.common.by import By

    with open("output/files-to-fetch.txt") as f:
        stripped = (line.rstrip() for line in f)
        # An empty entry would make browser.get() fail, so drop blank lines.
        lines = [entry for entry in stripped if entry]

    # ask password now instead of after the delay
    password = ask_password("Enter password for " + server_string)

    display = Display(visible=0, size=(800, 600))
    display.start()
    try:
        browser = webdriver.Firefox()
        try:
            # log in to gitlab instance
            browser.get(server_string + "/users/sign_in")
            browser.find_element(By.ID, "user_login").send_keys('bgstack15')
            browser.find_element(By.ID, "user_password").send_keys(password)
            browser.find_element(By.CLASS_NAME, "qa-sign-in-button").click()
            browser.get(server_string + "/profile")  # always needs the authentication
            scrollDownAllTheWay(browser)

            for thisfile in lines:
                destfile = dest_filename(thisfile)
                print("Saving", thisfile, outdir + "/" + destfile)
                browser.get(thisfile)
                scrollDownAllTheWay(browser)
                # Write the text itself: printing page_source.encode(...) would
                # store the b'...' repr of the bytes instead of the HTML.
                with open(outdir + "/" + destfile, "w", encoding="utf-8") as text_file:
                    print(browser.page_source, file=text_file)
        finally:
            # Release the browser even when a page fails to load or save.
            browser.quit()
    finally:
        display.stop()


if __name__ == "__main__":
    main()