-rwxr-xr-x | source/auth.py           | 269
-rw-r--r-- | source/binarytree.py     | 177
-rw-r--r-- | source/conf.py           |  57
-rw-r--r-- | source/export.py         | 274
-rwxr-xr-x | source/feedgetter.py     | 231
-rwxr-xr-x | source/log.py            |  67
-rw-r--r-- | source/mongodb.py        | 283
-rwxr-xr-x | source/pyAggr3g470r      | 143
-rwxr-xr-x | source/pyAggr3g470r.py   | 715
-rw-r--r-- | source/search.py         | 129
-rw-r--r-- | source/testbinarytree.py |  45
-rwxr-xr-x | source/utils.py          | 317
12 files changed, 0 insertions(+), 2707 deletions(-)
diff --git a/source/auth.py b/source/auth.py deleted file mode 100755 index 82c3a440..00000000 --- a/source/auth.py +++ /dev/null @@ -1,269 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.3 $" -__date__ = "$Date: 2012/10/12 $" -__revision__ = "$Date: 2013/01/14 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -# -# Form based authentication for CherryPy. Requires the -# Session tool to be loaded. -# - -import cherrypy -import hashlib - -import log - -SESSION_KEY = '_cp_username' - -import csv -class excel_french(csv.Dialect): - delimiter = ';' - quotechar = '"' - doublequote = True - skipinitialspace = False - lineterminator = '\n' - quoting = csv.QUOTE_MINIMAL - -csv.register_dialect('excel_french', excel_french) - -def change_username(username, new_username, password_file='./var/password'): - """ - Change the password corresponding to username. - """ - users_list = [] - result = False - with open(password_file, 'r') as csv_readfile_read: - cr = csv.reader(csv_readfile_read, 'excel_french') - users_list = [elem for elem in cr] - with open(password_file, 'w') as csv_file_write: - cw = csv.writer(csv_file_write, 'excel_french') - for user in users_list: - if user[0] == username: - cw.writerow([new_username, user[1]]) - result = True - else: - cw.writerow(user) - return result - -def change_password(username, new_password, password_file='./var/password'): - """ - Change the password corresponding to username. - """ - users_list = [] - result = False - with open(password_file, 'r') as csv_readfile_read: - cr = csv.reader(csv_readfile_read, 'excel_french') - users_list = [elem for elem in cr] - with open(password_file, 'w') as csv_file_write: - cw = csv.writer(csv_file_write, 'excel_french') - for user in users_list: - if user[0] == username: - m = hashlib.sha1() - m.update(new_password.encode()) - cw.writerow([user[0], m.hexdigest()]) - result = True - else: - cw.writerow(user) - return result - -def check_credentials(username, password, password_file='./var/password'): - """ - Verifies credentials for username and password. - Returns None on success or a string describing the error on failure. - """ - USERS = {} - cr = csv.reader(open(password_file, "r"), 'excel_french') - for row in cr: - USERS[row[0]] = row[1] - - m = hashlib.sha1() - m.update(password.encode()) - if username in list(USERS.keys()) and USERS[username] == m.hexdigest(): - return None - else: - return "Incorrect username or password." - # An example implementation which uses an ORM could be: - # u = User.get(username) - # if u is None: - # return u"Username %s is unknown to me." 
% username - # if u.password != md5.new(password).hexdigest(): - # return u"Incorrect password" - -def check_auth(*args, **kwargs): - """ - A tool that looks in config for 'auth.require'. If found and it - is not None, a login is required and the entry is evaluated as a list of - conditions that the user must fulfill. - """ - conditions = cherrypy.request.config.get('auth.require', None) - if conditions is not None: - username = cherrypy.session.get(SESSION_KEY) - if username: - cherrypy.request.login = username - for condition in conditions: - # A condition is just a callable that returns true or false - if not condition(): - raise cherrypy.HTTPRedirect("/auth/login") - else: - raise cherrypy.HTTPRedirect("/auth/login") - -cherrypy.tools.auth = cherrypy.Tool('before_handler', check_auth) - -def require(*conditions): - """ - A decorator that appends conditions to the auth.require config - variable. - """ - def decorate(f): - if not hasattr(f, '_cp_config'): - f._cp_config = dict() - if 'auth.require' not in f._cp_config: - f._cp_config['auth.require'] = [] - f._cp_config['auth.require'].extend(conditions) - return f - return decorate - - -# Conditions are callables that return True -# if the user fulfills the conditions they define, False otherwise -# -# They can access the current username as cherrypy.request.login -# -# Define those at will however suits the application. - -def member_of(groupname): - def check(): - # replace with actual check if <username> is in <groupname> - return cherrypy.request.login == 'joe' and groupname == 'admin' - return check - -def name_is(reqd_username): - return lambda: reqd_username == cherrypy.request.login - -# These might be handy - -def any_of(*conditions): - """ - Returns True if any of the conditions match. - """ - def check(): - for c in conditions: - if c(): - return True - return False - return check - -# By default all conditions are required, but this might still be -# needed if you want to use it inside of an any_of(...) condition -def all_of(*conditions): - """ - Returns True if all of the conditions match. - """ - def check(): - for c in conditions: - if not c(): - return False - return True - return check - - -class AuthController(object): - """ - This class provides login and logout actions. - """ - def __init__(self): - self.logger = log.Log() - self.username = "" - - def on_login(self, username): - """ - Called on successful login. - """ - self.username = username - self.logger.info(username + ' logged in.') - - def on_logout(self, username): - """ - Called on logout. - """ - self.logger.info(username + ' logged out.') - self.username = "" - - def get_loginform(self, username, msg="Enter login information", from_page="/"): - """ - Login page. 
- """ - return """<!DOCTYPE html>\n<html> - <head> - <meta charset="utf-8" /> - <title>pyAggr3g470r</title> - <link rel="stylesheet" href="/css/log.css" /> - </head> - <body> - <div> - <div id="logform"> - <img src="/static/img/tuxrss.png" alt="pyAggr3g470r" /> - <form method="post" action="/auth/login"> - <input type="hidden" name="from_page" value="%(from_page)s" /> - %(msg)s<br /> - <input type="text" name="username" value="%(username)s" placeholder="Username" autofocus="autofocus" /><br /> - <input type="password" name="password" placeholder="Password" /><br /> - <input type="submit" value="Log in" /> - </form> - </div><!-- end #main --> - </div><!-- end #center --> - </body> -</html>""" % locals() - - @cherrypy.expose - def login(self, username=None, password=None, from_page="/"): - """ - Open a session for an authenticated user. - """ - if username is None or password is None: - return self.get_loginform("", from_page=from_page) - - error_msg = check_credentials(username, password) - if error_msg: - self.logger.info(error_msg) - return self.get_loginform(username, error_msg, from_page) - else: - cherrypy.session[SESSION_KEY] = cherrypy.request.login = username - self.on_login(username) - raise cherrypy.HTTPRedirect(from_page or "/") - - @cherrypy.expose - def logout(self, from_page="/"): - """ - Cloase a session. - """ - sess = cherrypy.session - username = sess.get(SESSION_KEY, None) - sess[SESSION_KEY] = None - if username: - cherrypy.request.login = None - self.on_logout(username) - raise cherrypy.HTTPRedirect(from_page or "/")
\ No newline at end of file diff --git a/source/binarytree.py b/source/binarytree.py deleted file mode 100644 index a9294251..00000000 --- a/source/binarytree.py +++ /dev/null @@ -1,177 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -""" -A binary ordered tree implementation. -""" - -class Node(object): - """ - Represents a node. - """ - def __init__(self, data): - """ - Initialization. - """ - self.left = None - self.right = None - self.data = data - -class OrderedBinaryTree(object): - """ - Represents a binary ordered . - """ - def __init__(self, root=None): - """ - Initializes the root member. - """ - self.root = root - - def addNode(self, data): - """ - Creates a new node and returns it. - """ - return Node(data) - - def insert(self, root, data): - """ - Inserts a new data. - """ - if root == None: - # it there isn't any data - # adds it and returns - return self.addNode(data) - else: - # enters into the - if data['article_date'] <= root.data['article_date']: - # if the data is less than the stored one - # goes into the left-sub- - root.left = self.insert(root.left, data) - else: - # processes the right-sub- - root.right = self.insert(root.right, data) - return root - - def lookup(self, root, target): - """ - Looks for a value into the . - """ - if root == None: - return 0 - else: - # if it has found it... - if target == root.data: - return 1 - else: - if target['article_date'] < root.data['article_date']: - # left side - return self.lookup(root.left, target) - else: - # right side - return self.lookup(root.right, target) - - def minValue(self, root): - """ - Goes down into the left - arm and returns the last value. - """ - while(root.left != None): - root = root.left - return root.data - - def maxValue(self, root): - """ - Goes down into the right - arm and returns the last value. - """ - while(root.right != None): - root = root.right - return root.data - - def maxDepth(self, root): - """ - Return the maximum depth. - """ - if root == None: - return 0 - else: - # computes the two depths - ldepth = self.maxDepth(root.left) - rdepth = self.maxDepth(root.right) - # returns the appropriate depth - return max(ldepth, rdepth) + 1 - - def size(self, root): - if root == None: - return 0 - else: - return self.size(root.left) + 1 + self.size(root.right) - - def pre_order_traversal(self, root, result=[]): - """ - Depth-first. Pre-order traversal. - """ - if root == None: - pass - else: - result.append(root.data) - self.pre_order_traversal(root.left, result) - self.pre_order_traversal(root.right, result) - return result - - def in_order_traversal(self, root, result=[]): - """ - Depth-first. In-order traversal. - """ - if root == None: - pass - else: - self.in_order_traversal(root.left, result) - result.append(root.data) - self.in_order_traversal(root.right, result) - return result - - def post_order_traversal(self, root, result=[]): - """ - Depth-first. Post-order traversal. - """ - if root == None: - pass - else: - self.post_order_traversal(root.left, result) - self.post_order_traversal(root.right, result) - result.append(root.data) - return result - - def __str__(self): - """ - Pretty display. - """ - return ", ".join([article["article_title"] for article in \ - self.in_order_traversal(self.root)]) - -if __name__ == "__main__": - # Point of entry in execution mode. 
- # create the tree - tree = OrderedBinaryTree() - # add the root node - root = tree.addNode(0) - # ask the user to insert values - for i in range(0, 5): - data = int(input("insert the node value nr %d: " % i)) - # insert values - tree.insert(root, data) - - tree.printTree(root) - print() - tree.printRevTree(root) - print() - data = int(input("Insert a value to find: ")) - if tree.lookup(root, data): - print("found") - else: - print("not found") - - print(tree.minValue(root)) - print(tree.maxDepth(root)) - print(tree.size(root))
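binarytree.py orders nodes by the article_date key of article dictionaries, so the __main__ demo above, which feeds plain integers into insert() and calls printTree()/printRevTree() (neither of which is defined), does not run as written. A minimal usage sketch against the class as defined: insert() returns the root, so the caller has to keep tree.root current, and the traversal methods share a mutable default result=[] list, so repeated calls accumulate.

    from datetime import datetime
    from binarytree import OrderedBinaryTree   # the module deleted above

    tree = OrderedBinaryTree()
    articles = [
        {"article_title": "Older post", "article_date": datetime(2013, 1, 1)},
        {"article_title": "Newer post", "article_date": datetime(2013, 6, 1)},
    ]
    for article in articles:
        tree.root = tree.insert(tree.root, article)   # first call creates the root node

    print(tree)                                       # titles in chronological order
    print(tree.maxDepth(tree.root), tree.size(tree.root))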
\ No newline at end of file diff --git a/source/conf.py b/source/conf.py deleted file mode 100644 index 1b262927..00000000 --- a/source/conf.py +++ /dev/null @@ -1,57 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.2 $" -__date__ = "$Date: 2012/04/22 $" -__revision__ = "$Date: 2013/08/15 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - - -import os -import configparser -# load the configuration -config = configparser.SafeConfigParser() -try: - config.read("./cfg/pyAggr3g470r.cfg") -except: - config.read("./cfg/pyAggr3g470r.cfg-sample") -path = os.path.abspath(".") - -MONGODB_ADDRESS = config.get('MongoDB', 'address') -MONGODB_PORT = int(config.get('MongoDB', 'port')) -MONGODB_DBNAME = config.get('MongoDB', 'dbname') -MONGODB_USER = config.get('MongoDB', 'user') -MONGODB_PASSWORD = config.get('MongoDB', 'password') - -HTTP_PROXY = config.get('feedparser', 'http_proxy') -USER_AGENT = config.get('feedparser', 'user_agent') -FEED_LIST = config.get('feedparser', 'feed_list') - -MAIL_ENABLED = bool(int(config.get('mail','enabled'))) -mail_from = config.get('mail','mail_from') -mail_to = config.get('mail','mail_to') -smtp_server = config.get('mail','smtp') -username = config.get('mail','username') -password = config.get('mail','password') - -DIASPORA_POD = config.get('misc', 'diaspora_pod')
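conf.py loads ./cfg/pyAggr3g470r.cfg with configparser, but the try/except around config.read() never triggers the fallback to the sample file: read() does not raise when a file is missing, it simply returns the list of files it managed to parse. A small sketch of the same load with a working fallback (same file names and sections assumed):

    import configparser

    config = configparser.ConfigParser()
    # read() skips missing files silently, so check its return value for the fallback.
    if not config.read("./cfg/pyAggr3g470r.cfg"):
        config.read("./cfg/pyAggr3g470r.cfg-sample")

    MONGODB_ADDRESS = config.get("MongoDB", "address")
    MONGODB_PORT = config.getint("MongoDB", "port")      # getint() replaces int(config.get(...))
    MAIL_ENABLED = config.getboolean("mail", "enabled")  # accepts the 0/1 values stored in the file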
\ No newline at end of file diff --git a/source/export.py b/source/export.py deleted file mode 100644 index 5e06aea1..00000000 --- a/source/export.py +++ /dev/null @@ -1,274 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.4 $" -__date__ = "$Date: 2011/10/24 $" -__revision__ = "$Date: 2013/03/05 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -# -# This file contains the export functions of pyAggr3g470r. Indeed -# it is possible to export the database of articles in different formats: -# - simple HTML webzine; -# - text file; -# - ePub file; -# - PDF file. -# - -import os -import time - -import conf -import utils - -def HTML_HEADER(title="pyAggr3g470r", css="./style.css"): - return """<!DOCTYPE html> -<html lang="en-US"> -<head> -<title>%s</title> -<meta charset="utf-8"/> -<link rel="stylesheet" href="%s" /> -</head> -<body>""" % (title, css) - -HTML_FOOTER = """<hr /> -<p>This archive has been generated with -<a href="https://bitbucket.org/cedricbonhomme/pyaggr3g470r/">pyAggr3g470r</a>. -A software under GPLv3 license. -You are welcome to copy, modify or redistribute the source code according to the -<a href="http://www.gnu.org/licenses/gpl-3.0.txt">GPLv3</a> license.</p> -</body> -</html> -""" - -CSS = """body { - font:normal medium 'Gill Sans','Gill Sans MT',Verdana,sans-serif; - margin:1.20em auto; - width:80%; - line-height:1.75; -} -blockquote { - font-size:small; - line-height:2.153846; - margin:2.153846em 0; - padding:0;font-style:oblique; - border-left:1px dotted; - margin-left:2.153846em; - padding-left:2.153846em; -} -blockquote p{ - margin:2.153846em 0; -} -p+br { - display:none; -} -h1 { -font-size:large; -} -h2,h3 { - font-size:medium; -} -hr { - border-style:dotted; - height:1px; - border-width: 1px 0 0 0; - margin:1.45em 0 1.4em; - padding:0; -} -a { - text-decoration:none; - color:#00008B; -} -#footer { - clear:both; - text-align:center; - font-size:small; -} -img { - border:0; -} -.horizontal,.simple li { - margin:0; - padding:0; - list-style:none; - display:inline -} -.simple li:before { - content:"+ "; -} -.simple > li:first-child:before { - content:""; -} -.author { - text-decoration:none; - display:block; - float:right; - margin-left:2em; - font-size:small; -} -.content { - margin:1.00em 1.00em; -}""" - -def export_html(mongo_db): - """ - Export the articles given in parameter in a simple Webzine. 
- """ - nb_articles = format(mongo_db.nb_articles(), ",d") - feeds = mongo_db.get_all_feeds() - index = HTML_HEADER("News archive") - index += "<h1>List of feeds</h1>\n" - index += """<p>%s articles.</p>\n<ul>\n""" % (nb_articles,) - for feed in feeds: - # creates a folder for each stream - feed_folder = conf.path + "/var/export/webzine/" + \ - utils.normalize_filename(feed["feed_id"]) - try: - os.makedirs(feed_folder) - except OSError: - # directories already exists (not a problem) - pass - - index += """ <li><a href="%s">%s</a></li>\n""" % \ - (feed["feed_id"], feed["feed_title"]) - - posts = HTML_HEADER(feed["feed_title"], "../style.css") - posts += """<h1>Articles of the feed <a href="%s">%s</a></h1>\n""" % (feed["site_link"], feed["feed_title"]) - posts += """<p>%s articles.</p>\n""" % (format(mongo_db.nb_articles(feed["feed_id"]), ",d"),) - - for article in mongo_db.get_articles(feed_id=feed["feed_id"]): - - post_file_name = os.path.normpath(feed_folder + "/" + article["article_id"] + ".html") - feed_index = os.path.normpath(feed_folder + "/index.html") - - posts += article["article_date"].ctime() + " - " + \ - """<a href="./%s.html">%s</a>""" % \ - (article["article_id"], article["article_title"][:150]) + "<br />\n" - - a_post = HTML_HEADER(article["article_title"], "../style.css") - a_post += '<div style="width:60%; overflow:hidden; text-align:justify; margin:0 auto">\n' - a_post += """<h1><a href="%s">%s</a></h1>\n<br />""" % \ - (article["article_link"], article["article_title"]) - a_post += article["article_content"] - a_post += "</div>\n<hr />\n" - a_post += """<br />\n<a href="%s">Complete story</a>\n<br />\n""" % (article["article_link"],) - a_post += HTML_FOOTER - - with open(post_file_name, "w") as f: - f.write(a_post) - - posts += HTML_FOOTER - with open(feed_index, "w") as f: - f.write(posts) - - index += "</ul>\n" - index += "<p>" + time.strftime("Generated on %d %b %Y at %H:%M.") + "</p>\n" - index += HTML_FOOTER - with open(conf.path + "/var/export/webzine/" + "index.html", "w") as f: - f.write(index) - with open(conf.path + "/var/export/webzine/" + "style.css", "w") as f: - f.write(CSS) - -def export_txt(mongo_db): - """ - Export the articles given in parameter in text files. - """ - feeds = mongo_db.get_all_feeds() - for feed in feeds: - # creates folder for each stream - folder = conf.path + "/var/export/txt/" + \ - utils.normalize_filename(feed["feed_title"].strip().replace(':', '').lower()) - try: - os.makedirs(folder) - except OSError: - # directories already exists (not a problem) - pass - - for article in mongo_db.get_articles(feed_id=feed["feed_id"]): - name = article["article_date"].ctime().strip().replace(' ', '_') - name = os.path.normpath(folder + "/" + name + ".txt") - - content = "Title: " + article["article_title"] + "\n\n\n" - content += utils.clear_string(article["article_content"]) - - with open(name, "w") as f: - f.write(content) - -def export_epub(mongo_db): - """ - Export the articles given in parameter in ePub files. 
- """ - from epub import ez_epub - feeds = mongo_db.get_all_feeds() - for feed in feeds: - # creates folder for each stream - folder = conf.path + "/var/export/epub/" + \ - utils.normalize_filename(feed["feed_title"].strip().replace(':', '').lower().encode('utf-8')) - try: - os.makedirs(folder) - except OSError: - # directories already exists (not a problem) - pass - - for article in mongo_db.get_articles(feed_id=feed["feed_id"]): - name = article["article_date"].ctime().strip().replace(' ', '_') - name = os.path.normpath(folder + "/" + name + ".epub") - - section = ez_epub.Section() - section.title = article["article_title"] - section.paragraphs = [utils.clear_string(article["article_content"])] - ez_epub.makeBook(article["article_title"], [feed["feed_title"]], [section], \ - name, lang='en-US', cover=None) - -def export_pdf(feeds): - """ - Export the articles given in parameter in PDF files. - """ - from xhtml2pdf import pisa - import io as StringIO - for feed in list(feeds.values()): - # creates folder for each stream - folder = utils.path + "/var/export/pdf/" + \ - utils.normalize_filename(feed.feed_title.strip().replace(':', '').lower()) - try: - os.makedirs(folder) - except OSError: - # directories already exists (not a problem) - pass - - for article in list(feed.articles.values()): - name = article.article_date.strip().replace(' ', '_') - name = os.path.normpath(folder + "/" + name + ".pdf") - - content = HTML_HEADER(article.article_title) - content += '\n<div style="width: 50%; overflow:hidden; text-align: justify; margin:0 auto">\n' - content += """<h1><a href="%s">%s</a></h1><br />""" % \ - (article.article_link, article.article_title) - content += article.article_description - content += "</div>\n<hr />\n" - content += HTML_FOOTER - - try: - pdf = pisa.CreatePDF(StringIO.StringIO(content), file(name, "wb")) - except: - pass diff --git a/source/feedgetter.py b/source/feedgetter.py deleted file mode 100755 index ce1cba1b..00000000 --- a/source/feedgetter.py +++ /dev/null @@ -1,231 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. 
If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 1.8 $" -__date__ = "$Date: 2010/09/02 $" -__revision__ = "$Date: 2013/08/15 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -import hashlib -import threading -import urllib.request -import feedparser -from bs4 import BeautifulSoup -from datetime import datetime -from contextlib import contextmanager - -import conf -import search -import utils -import mongodb - -import log -pyaggr3g470r_log = log.Log() - -list_of_threads = [] - -@contextmanager -def opened_w_error(filename, mode="r"): - try: - f = open(filename, mode) - except IOError as err: - yield None, err - else: - try: - yield f, None - finally: - f.close() - -class FeedGetter(object): - """ - This class is in charge of retrieving feeds listed in ./var/feed.lst. - This class uses feedparser module from Mark Pilgrim. - For each feed a new thread is launched. - """ - def __init__(self): - """ - Initializes the database connection. - """ - # MongoDB connections - self.articles = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ - conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) - if conf.HTTP_PROXY == "": - self.proxy = urllib.request.ProxyHandler({}) - else: - self.proxy = urllib.request.ProxyHandler({"http" : conf.HTTP_PROXY}) - feedparser.USER_AGENT = conf.USER_AGENT - - def retrieve_feed(self, feed_url=None, feed_original=None): - """ - Parse the file 'feeds.lst' and launch a thread for each RSS feed. - """ - if feed_url != None: - self.process(feed_url, feed_original) - else: - with opened_w_error(conf.FEED_LIST) as (f, err): - if err: - pyaggr3g470r_log.error("List of feeds not found.") - else: - for a_feed in f: - # test if the URL is well formed - for url_regexp in utils.url_finders: - if url_regexp.match(a_feed): - the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "") - try: - # launch a new thread for the RSS feed - thread = threading.Thread(None, self.process, \ - None, (the_good_url,)) - thread.start() - list_of_threads.append(thread) - except: - pass - break - - # wait for all threads are done - for th in list_of_threads: - th.join() - - def process(self, the_good_url, feed_original=None): - """Request the URL - - Executed in a thread. - """ - if utils.open_url(the_good_url)[0] == True: - # if ressource is available add the articles in the base. - self.add_into_database(the_good_url, feed_original) - - def add_into_database(self, feed_link, feed_original=None): - """ - Add the articles of the feed 'a_feed' in the database. 
- """ - a_feed = feedparser.parse(feed_link, handlers = [self.proxy]) - if a_feed['entries'] == []: - return - try: - feed_image = a_feed.feed.image.href - except: - feed_image = "/img/feed-icon-28x28.png" - - if feed_original != None: - feed_link = feed_original - - sha1_hash = hashlib.sha1() - sha1_hash.update(feed_link.encode('utf-8')) - feed_id = sha1_hash.hexdigest() - - feed = self.articles.get_feed(feed_id) - if None == feed: - collection_dic = {"feed_id": feed_id, \ - "type": 0, \ - "feed_image": feed_image, \ - "feed_title": utils.clear_string(a_feed.feed.title), \ - "feed_link": feed_link, \ - "site_link": a_feed.feed.link, \ - "mail": False \ - } - self.articles.add_collection(collection_dic) - feed = self.articles.get_feed(feed_id) - - articles = [] - for article in a_feed['entries']: - description = "" - article_title = "" - try: - # article content - description = article.content[0].value - except AttributeError: - try: - # article description - description = article.description - except Exception: - description = "" - try: - description = BeautifulSoup(description, "html.parser").decode() - article_title = BeautifulSoup(article.title, "html.parser").decode() - except Exception as E: - pyaggr3g470r_log.error("Problem when sanitizing the content of the feed: " + feed_link) - article_title = article.title - - try: - post_date = datetime(*article.published_parsed[:6]) - except: - post_date = datetime(*article.updated_parsed[:6]) - - sha1_hash = hashlib.sha1() - sha1_hash.update(article.link.encode('utf-8')) - article_id = sha1_hash.hexdigest() - - article = {"article_id": article_id, \ - "type":1, \ - "article_date": post_date, \ - "article_link": article.link, \ - "article_title": article_title, \ - "article_content": description, \ - "article_readed": False, \ - "article_like": False \ - } - - articles.append(article) - - if self.articles.get_articles(feed_id, article_id) == []: - # add the article to the Whoosh index - try: - search.add_to_index([article], feed) - except: - print("Whoosh error.") - pyaggr3g470r_log.error("Whoosh error.") - continue - - if conf.MAIL_ENABLED and feed["mail"]: - # if subscribed to the feed - threading.Thread(None, utils.send_mail, None, (conf.mail_from, conf.mail_to, \ - a_feed.feed.title, \ - article_title, description)).start() - self.articles.add_articles(articles, feed_id) - - -if __name__ == "__main__": - # Point of entry in execution mode - feed_getter = FeedGetter() - # Retrieve all feeds - feed_getter.retrieve_feed() - - # If you want to get all articles of a blog: - """ - for i in range(1,86): - feed_original = "http://esr.ibiblio.org/?feed=rss2" - feed = feed_original + "&paged=" + str(i) - print("Retrieving", feed, "...") - feed_getter.retrieve_feed(feed, feed_original) - """ - """ - for i in range(1,5): - feed_original = "http://spaf.wordpress.com/feed/" - feed = feed_original + "?paged=" + str(i) - print("Retrieving", feed, "...") - feed_getter.retrieve_feed(feed, feed_original) - """ - - # For a blogspot blog: - #feed_getter.retrieve_feed("http://www.blogger.com/feeds/4195135246107166251/posts/default", "http://neopythonic.blogspot.com/feeds/posts/default") - #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
\ No newline at end of file diff --git a/source/log.py b/source/log.py deleted file mode 100755 index 5db5d838..00000000 --- a/source/log.py +++ /dev/null @@ -1,67 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.1 $" -__date__ = "$Date: 2012/10/12 $" -__revision__ = "$Date: 2012/10/12 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -class Log(object): - """ - Log events. Especially events relative to authentication. - """ - def __init__(self): - """ - Initialization of the logger. - """ - import logging - self.logger = logging.getLogger("pyaggr3g470r") - hdlr = logging.FileHandler('./var/pyaggr3g470r.log') - formater = logging.Formatter('%(asctime)s %(levelname)s %(message)s') - hdlr.setFormatter(formater) - self.logger.addHandler(hdlr) - self.logger.setLevel(logging.INFO) - - def info(self, message): - """ - Log notices. - """ - self.logger.info(message) - - def warning(self, message): - """ - Log warnings. - """ - self.logger.warning(message) - - def error(self, message): - """ - Log errors. - """ - self.logger.warning(message) - - def critical(self, message): - """ - Log critical errors. - """ - self.logger.critical(message) diff --git a/source/mongodb.py b/source/mongodb.py deleted file mode 100644 index 04cd44fa..00000000 --- a/source/mongodb.py +++ /dev/null @@ -1,283 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.8 $" -__date__ = "$Date: 2012/03/03 $" -__revision__ = "$Date: 2013/06/25 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -import pymongo - -class Articles(object): - """ - This class is responsible of the management of the MongoDB - database. 
- """ - def __init__(self, url='localhost', port=27017, db_name="pyaggr3g470r", user="", password=""): - """ - Instantiates the connection. - """ - self.db_name = db_name - self.connection = pymongo.connection.Connection(url, port) - self.db = pymongo.database.Database(self.connection, self.db_name) - if password != "": - self.db.authenticate(user, password) - collections = self.db.collection_names() - for collection_name in collections: - if collection_name != "system.indexes": - self.db[collection_name].ensure_index([("article_date", pymongo.DESCENDING)], \ - name="date_index", unique=False, \ - background=True) - - def add_collection(self, new_collection): - """ - Creates a new collection for a new feed. - """ - collection = self.db[new_collection["feed_id"]] - collection.insert(new_collection) - - def add_articles(self, articles, feed_id): - """ - Add article(s) in a collection. - """ - collection = self.db[str(feed_id)] - for article in articles: - cursor = collection.find({"article_id":article["article_id"]}) - if cursor.count() == 0: - collection.insert(article) - - def delete_feed(self, feed_id): - """ - Delete a collection (feed with all articles). - """ - self.db.drop_collection(feed_id) - - def delete_article(self, feed_id, article_id): - """ - Delete an article. - """ - collection = self.db[str(feed_id)] - collection.remove(spec_or_id={"article_id":article_id}, safe=True) - - def get_feed(self, feed_id): - """ - Return information about a feed (collection). - Return None if the collection does not exist. - """ - try: - return next(self.db[str(feed_id)].find()) - except: - return None - - def get_all_feeds(self, condition=None): - """ - Return all feeds object. The returned list - is sorted by alphabetically (by feed name). - """ - feeds = [] - collections = self.db.collection_names() - for collection_name in collections: - if collection_name != "system.indexes": - if condition is None: - cursor = self.db[collection_name].find({"type":0}) - else: - cursor = self.db[collection_name].find({"type":0, condition[0]:condition[1]}) - if cursor.count() != 0: - feeds.append(next(cursor)) - feeds.sort(key = lambda elem: elem['feed_title'].lower()) - return feeds - - def get_articles(self, feed_id=None, article_id=None, condition=None, limit=1000000000): - """ - Return one or several articles. - The parameter "condition" is an optional requirement, for example: - get_articles(feed_id, condition=("article_readed", False)) will - return all unread articles of the feed 'feed_id'. - """ - if feed_id == None and article_id == None: - # Return all articles. - articles = [] - collections = self.db.collection_names() - for collection_name in collections: - collection = self.db[collection_name] - if condition is None: - articles.extend(collection.find({"type":1}, limit=limit)) - else: - articles.extend(collection.find({"type":1, condition[0]:condition[1]}, limit=limit)) - return articles - - elif feed_id != None and article_id == None: - # Return all the articles of a collection. - collection = self.db[str(feed_id)] - if condition is None: - cursor = collection.find({"type":1}, limit=limit) - else: - cursor = collection.find({"type":1, condition[0]:condition[1]}, limit=limit) - return cursor.sort([("article_date", pymongo.DESCENDING)]) - - elif feed_id != None and article_id != None: - # Return a precise article. 
- collection = self.db[str(feed_id)] - try: - return next(collection.find({"article_id":article_id})) - except: - return [] - - def get_favorites(self, feed_id=None): - """ - Return favorites articles. - """ - if feed_id is not None: - # only for a feed - collection = self.db[feed_id] - cursor = collection.find({'type':1, 'article_like':True}) - return cursor.sort([("article_date", pymongo.DESCENDING)]) - else: - favorites = [] - for feed_id in self.db.collection_names(): - favorites += self.get_favorites(feed_id) - return favorites - - def nb_articles(self, feed_id=None): - """ - Return the number of articles of a feed - or of all the database. - """ - if feed_id is not None: - collection = self.db[feed_id] - cursor = collection.find({'type':1}) - return cursor.count() - else: - nb_articles = 0 - for feed_id in self.db.collection_names(): - nb_articles += self.nb_articles(feed_id) - return nb_articles - - def nb_unread_articles(self, feed_id=None): - """ - Return the number of unread articles of a feed - or of all the database. - """ - if feed_id is not None: - return self.get_articles(feed_id=feed_id, condition=("article_readed", False)).count() - else: - return len(self.get_articles(condition=("article_readed", False))) - - def like_article(self, like, feed_id, article_id): - """ - Like or unlike an article. - """ - collection = self.db[str(feed_id)] - collection.update({"article_id": article_id}, {"$set": {"article_like": like}}) - - def nb_favorites(self, feed_id=None): - """ - Return the number of favorites articles of a feed - or of all the database. - """ - if feed_id is not None: - return self.get_favorites(feed_id).count() - else: - return len(self.get_favorites()) - - def nb_mail_notifications(self): - """ - Return the number of subscribed feeds. - """ - nb_mail_notifications = 0 - for feed_id in self.db.collection_names(): - collection = self.db[feed_id] - cursor = collection.find({'type':0, 'mail':True}) - nb_mail_notifications += cursor.count() - return nb_mail_notifications - - def mark_as_read(self, readed, feed_id=None, article_id=None): - """ - Mark one or several articles as read. - """ - if feed_id != None and article_id != None: - collection = self.db[str(feed_id)] - collection.update({"article_id": article_id, "article_readed":not readed}, {"$set": {"article_readed": readed}}) - elif feed_id != None and article_id == None: - collection = self.db[str(feed_id)] - collection.update({"type": 1, "article_readed":not readed}, {"$set": {"article_readed": readed}}, multi=True) - else: - for feed_id in self.db.collection_names(): - self.mark_as_read(readed, feed_id, None) - - def update_feed(self, feed_id, changes): - """ - Update a feed. - """ - collection = self.db[str(feed_id)] - collection.update({"type": 0, "feed_id":feed_id}, {"$set": changes}, multi=True) - if "feed_id" in changes.keys(): - self.db[str(feed_id)].rename(str(changes["feed_id"])) - - # Functions on database - def drop_database(self): - """ - Drop all the database - """ - self.connection.drop_database(self.db_name) - - -if __name__ == "__main__": - # Point of entry in execution mode. 
- articles = Articles() - # Create a collection for a stream - collection_dic = {"collection_id": 42,\ - "feed_image": "Image", \ - "feed_title": "Title", \ - "feed_link": "Link", \ - "site_title": "Site link", \ - "mail": True, \ - } - #articles.add_collection(collection_dic) - - # Add an article in the newly created collection - article_dic1 = {"article_id": 51, \ - "article_date": "Today", \ - "article_link": "Link of the article", \ - "article_title": "The title", \ - "article_content": "The content of the article", \ - "article_readed": True, \ - "article_like": True \ - } - article_dic2 = {"article_id": 52, \ - "article_date": "Yesterday", \ - "article_link": "Link", \ - "article_title": "Hello", \ - "article_content": "The content of the article", \ - "article_readed": True, \ - "article_like": True \ - } - - #articles.add_articles([article_dic1, article_dic2], 42) - - print("All articles:") - #print articles.get_all_articles() - - - # Drop the database - #articles.drop_database() diff --git a/source/pyAggr3g470r b/source/pyAggr3g470r deleted file mode 100755 index 3755ad16..00000000 --- a/source/pyAggr3g470r +++ /dev/null @@ -1,143 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2012 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.2 $" -__date__ = "$Date: 2011/06/20 $" -__date__ = "$Date: 2013/02/17 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -# This control file is inspired from Forban: http://www.foo.be/forban. - -import os -import sys -import time -import subprocess -import platform -import signal - -PATH = os.path.abspath(".") -SERVICE = "pyAggr3g470r" - -def service_start(python_command, servicename=None): - """ - Starts a new service with Popen and returns the processus id. - """ - if servicename is not None: - service = servicename + ".py" - proc = subprocess.Popen([python_command, "-tt", service], stderr=subprocess.STDOUT, stdout=subprocess.PIPE) - time.sleep(0.15) - return proc.pid - return False - -def writepid(processname=None, pid=None): - """ - Writes the pid of processname in a file. 
- """ - pidpath = os.path.join(PATH, "var", processname + ".pid") - if processname is not None and pid is not None: - with open(pidpath, "w") as f: - f.write(str(pid)) - return True - return False - -def checkpid(servicename=None): - pidpath = os.path.join(PATH,"var", servicename + ".pid") - if os.path.exists(pidpath): - return True - else: - return False - -def pidof(processname=None): - pidpath = os.path.join(PATH, "var", processname + ".pid") - if processname is not None and os.path.exists(pidpath): - with open(pidpath) as f: - pid = f.read() - return pid - return False - -def rmpid(processname=None): - """ - Deletes the file which contains the PID. - """ - pidpath = os.path.join(PATH, "var", processname + ".pid") - if os.path.exists(pidpath): - os.unlink(pidpath) - return True - else: - return False - -def start(python_command): - if not checkpid(servicename=SERVICE): - pid = service_start(python_command, servicename =SERVICE) - writepid(processname=SERVICE, pid=pid) - print(SERVICE + " is starting with pid: " + pidof(processname=SERVICE)) - else: - print(SERVICE + " could not be started (pid exists)") - retval = False - -def stop(): - """ - Stop the process SERVICE. - """ - print("Stopping " + SERVICE + "...") - retval = True - pid = pidof(processname=SERVICE) - if pid: - if platform.system() == "Windows": - import win32api - import win32con - phandle = win32api.OpenProcess(win32con.PROCESS_TERMINATE, 0, int(pid)) - win32api.TerminateProcess(phandle, 0) - win32api.CloseHandle(phandle) - rmpid(processname=SERVICE) - else: - try: - os.kill(int(pid), signal.SIGKILL) - except OSError as e: - print(SERVICE + " unsuccessfully stopped") - retval = False - finally: - rmpid(processname=SERVICE) - return retval - -def usage(): - print("pyAggr3g470r (start|stop|restart)") - exit (1) - -if __name__ == "__main__": - # Point of entry in execution mode. - python_command = "python" - if sys.version_info.major == 2: - # Ensures that code doesn't illegally mixed tabs and spaces - python_command = "python3.3" - if len(sys.argv) == 1: - usage() - elif sys.argv[1] == "start": - start(python_command) - elif sys.argv[1] == "stop": - stop() - elif sys.argv[1] == "restart": - stop() - start(python_command) - else: - usage() diff --git a/source/pyAggr3g470r.py b/source/pyAggr3g470r.py deleted file mode 100755 index 922e7114..00000000 --- a/source/pyAggr3g470r.py +++ /dev/null @@ -1,715 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. 
If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 4.1 $" -__date__ = "$Date: 2010/01/29 $" -__revision__ = "$Date: 2013/09/09 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -# -# This file contains the "Root" class which describes -# all pages (views) of pyAggr3g470r. These pages are: -# - main page; -# - management; -# - history; -# - favorites; -# - notifications; -# - unread; -# - feed summary; -# - inactives; -# - languages. -# Templates are described in ./templates with the Mako -# template library. -# - -import os -import re -import time -import datetime - -from collections import defaultdict -from whoosh.index import EmptyIndexError - -import cherrypy -from mako.template import Template -from mako.lookup import TemplateLookup -lookup = TemplateLookup(directories=['templates']) - -import conf -import utils -import export -import mongodb -import search -import feedgetter -import auth - -def error_404(status, message, traceback, version): - """ - Display an error if the page does not exist. - """ - message = "<p>Error %s - This page does not exist.</p>" % status - tmpl = lookup.get_template("error.html") - return tmpl.render(message=message) - -def handle_error(): - """ - Handle different type of errors. - """ - message = "<p>Sorry, an error occured.</p>" - cherrypy.response.status = 500 - cherrypy.response.body = [message] - -class RestrictedArea(object): - """ - All methods in this controller (and subcontrollers) is - open only to members of the admin group - """ - _cp_config = { - 'auth.auth.require': [auth.member_of('admin')] - } - - @cherrypy.expose - def index(self): - message = "<p>This is the admin only area.</p>" - tmpl = lookup.get_template("error.html") - return tmpl.render(message=message) - -class pyAggr3g470r(object): - """ - Main class. - All pages of pyAggr3g470r are described in this class. - """ - _cp_config = {'request.error_response': handle_error, \ - 'tools.sessions.on': True, \ - 'tools.auth.on': True} - - def __init__(self): - """ - """ - self.auth = auth.AuthController() - restricted = RestrictedArea() - - self.mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ - conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) - @auth.require() - def index(self): - """ - Main page containing the list of feeds and articles. - """ - feeds = self.mongo.get_all_feeds() - nb_unread_articles = self.mongo.nb_unread_articles() - nb_favorites = self.mongo.nb_favorites() - nb_mail_notifications = self.mongo.nb_mail_notifications() - tmpl = lookup.get_template("index.html") - return tmpl.render(feeds=feeds, nb_feeds=len(feeds), mongo=self.mongo, \ - nb_favorites=nb_favorites, nb_unread_articles=nb_unread_articles, \ - nb_mail_notifications=nb_mail_notifications, header_text=nb_unread_articles) - - index.exposed = True - - @auth.require() - def management(self): - """ - Management page. - Allows adding and deleting feeds. Export functions of the MongoDB data base - and display some statistics. 
- """ - feeds = self.mongo.get_all_feeds() - nb_mail_notifications = self.mongo.nb_mail_notifications() - nb_favorites = self.mongo.nb_favorites() - nb_articles = format(self.mongo.nb_articles(), ",d") - nb_unread_articles = format(self.mongo.nb_unread_articles(), ",d") - nb_indexed_documents = format(search.nb_documents(), ",d") - tmpl = lookup.get_template("management.html") - return tmpl.render(feeds=feeds, nb_mail_notifications=nb_mail_notifications, \ - nb_favorites=nb_favorites, nb_articles=nb_articles, \ - nb_unread_articles=nb_unread_articles, \ - mail_notification_enabled=conf.MAIL_ENABLED, \ - nb_indexed_documents=nb_indexed_documents) - - management.exposed = True - - @auth.require() - def statistics(self, word_size=6): - """ - More advanced statistics. - """ - articles = self.mongo.get_articles() - top_words = utils.top_words(articles, n=50, size=int(word_size)) - tag_cloud = utils.tag_cloud(top_words) - tmpl = lookup.get_template("statistics.html") - return tmpl.render(articles=articles, word_size=word_size, tag_cloud=tag_cloud) - - statistics.exposed = True - - @auth.require() - def search(self, query=None): - """ - Simply search for the string 'query' - in the description of the article. - """ - param, _, value = query.partition(':') - feed_id = None - if param == "Feed": - feed_id, _, query = value.partition(':') - search_result = defaultdict(list) - try: - results = search.search(param) - except EmptyIndexError as e: - return self.error('<p>The database has not been <a href="/index_base">indexed</a>.</p>') - for result in results: - article = self.mongo.get_articles(result[0], result[1]) - if article != []: - search_result[result[0]].append(article) - sorted_search_result = {feed_id: sorted(articles, key=lambda t: t['article_date'], reverse=True) \ - for feed_id, articles in search_result.items()} - tmpl = lookup.get_template("search.html") - return tmpl.render(search_result=sorted_search_result, query=query, feed_id=feed_id, mongo=self.mongo) - - search.exposed = True - - @auth.require() - def fetch(self, param=None): - """ - Fetch all feeds. - """ - feed_link = None - if None != param: - # Fetch only the feed specified in parameter - feed_link = self.mongo.get_feed(param)["feed_link"] - feed_getter = feedgetter.FeedGetter() - feed_getter.retrieve_feed(feed_url=feed_link) - return self.index() - - fetch.exposed = True - - @auth.require() - def article(self, param, plain_text=0): - """ - Display the article in parameter in a new Web page. - """ - try: - feed_id, article_id = param.split(':') - article = self.mongo.get_articles(feed_id, article_id) - if article == []: - return self.error("<p>This article do not exists.</p>") - feed = self.mongo.get_feed(feed_id) - articles = self.mongo.get_articles(feed_id) - except: - return self.error("<p>Bad URL. 
This article do not exists.</p>") - - if article["article_readed"] == False: - # if the current article is not yet readed, update the database - self.mark_as_read("Article:"+article["article_id"]+":"+feed["feed_id"]) - - # Description (full content) of the article - if plain_text == "1": - description = "<p>" + utils.clear_string(article["article_content"]) + "</p>" - else: - description = article["article_content"] - if description == "": - description = "<p>No description available.</p>" - - # Generation of the QR Code for the current article - utils.generate_qr_code(article) - - # Previous and following articles - previous, following = None, None - liste = self.mongo.get_articles(feed_id) - for current_article in self.mongo.get_articles(feed_id): - next(articles) - if current_article["article_id"] == article_id: - break - following = current_article - if following is None: - following = liste[liste.count()-1] - try: - previous = next(articles) - except StopIteration: - previous = liste[0] - - tmpl = lookup.get_template("article.html") - return tmpl.render(header_text=article["article_title"], article=article, previous=previous, following=following, \ - diaspora=conf.DIASPORA_POD, feed=feed, description=description, plain_text=plain_text) - - article.exposed = True - - @auth.require() - def feed(self, feed_id, word_size=6): - """ - This page gives summary informations about a feed (number of articles, - unread articles, average activity, tag cloud, e-mail notification and - favourite articles for the current feed. - """ - try: - feed = self.mongo.get_feed(feed_id) - if feed == None: - return self.error("<p>This feed do not exists.</p>") - articles = self.mongo.get_articles(feed_id, limit=10) - nb_articles_feed = self.mongo.nb_articles(feed_id) - nb_articles_total = self.mongo.nb_articles() - nb_unread_articles_feed = self.mongo.nb_unread_articles(feed_id) - favorites = self.mongo.get_favorites(feed_id) - nb_favorites = self.mongo.nb_favorites(feed_id) - except KeyError: - return self.error("<p>This feed do not exists.</p>") - - - if articles.count() != 0: - today = datetime.datetime.now() - last_article = articles[0]["article_date"] - first_article = articles[self.mongo.nb_articles(feed_id)-2]["article_date"] - delta = last_article - first_article - elapsed = today - last_article - average = round(nb_articles_feed / abs(delta.days), 2) - - top_words = utils.top_words(articles = self.mongo.get_articles(feed_id), n=50, size=int(word_size)) - tag_cloud = utils.tag_cloud(top_words) - - tmpl = lookup.get_template("feed.html") - return tmpl.render(feed=feed, articles=articles, favorites=favorites, \ - nb_articles_feed=nb_articles_feed, nb_articles_total=nb_articles_total, nb_unread_articles_feed=nb_unread_articles_feed, \ - nb_favorites = nb_favorites, first_post_date=first_article, end_post_date=last_article, \ - average=average, delta=delta, elapsed=elapsed, \ - tag_cloud=tag_cloud, word_size=word_size, \ - mail_to=conf.mail_to, mail_notification_enabled=conf.MAIL_ENABLED) - - tmpl = lookup.get_template("feed.html") - return tmpl.render(feed=feed, articles=[]) - - feed.exposed = True - - @auth.require() - def articles(self, feed_id): - """ - This page displays all articles of a feed. 
- """ - try: - feed = self.mongo.get_feed(feed_id) - articles = self.mongo.get_articles(feed_id) - except KeyError: - return self.error("<p>This feed do not exists.</p>") - tmpl = lookup.get_template("articles.html") - return tmpl.render(articles=articles, feed=feed) - - articles.exposed = True - - @auth.require() - def unread(self, feed_id=""): - """ - This page displays all unread articles of a feed. - """ - feeds = self.mongo.get_all_feeds() - tmpl = lookup.get_template("unread.html") - return tmpl.render(feeds=feeds, feed_id=feed_id, mongo=self.mongo) - unread.exposed = True - - @auth.require() - def history(self, query="all", m=""): - """ - This page enables to browse articles chronologically. - """ - feeds = self.mongo.get_all_feeds() - tmpl = lookup.get_template("history.html") - return tmpl.render(feeds=feeds, mongo=self.mongo, query=query, m=m) - - history.exposed = True - - @auth.require() - def error(self, message): - """ - Display a message (bad feed id, bad article id, etc.) - """ - tmpl = lookup.get_template("error.html") - return tmpl.render(message=message) - - error.exposed = True - - @auth.require() - def mark_as_read(self, target=""): - """ - Mark one (or more) article(s) as read by setting the value of the field - 'article_readed' of the MongoDB database to 'True'. - """ - param, _, identifiant = target.partition(':') - - # Mark all articles as read. - if param == "": - self.mongo.mark_as_read(True, None, None) - # Mark all articles from a feed as read. - elif param == "Feed" or param == "Feed_FromMainPage": - self.mongo.mark_as_read(True, identifiant, None) - # Mark an article as read. - elif param == "Article": - self.mongo.mark_as_read(True, identifiant.split(':')[1], identifiant.split(':')[0]) - return self.index() - - mark_as_read.exposed = True - - @auth.require() - def notifications(self): - """ - List all active e-mail notifications. - """ - feeds = self.mongo.get_all_feeds(condition=("mail",True)) - tmpl = lookup.get_template("notifications.html") - return tmpl.render(feeds=feeds, mail_to=conf.mail_to, mail_notification_enabled=conf.MAIL_ENABLED) - - notifications.exposed = True - - @auth.require() - def mail_notification(self, param): - """ - Enable or disable to notifications of news for a feed. - """ - try: - action, feed_id = param.split(':') - new_value = 1 == int(action) - self.mongo.update_feed(feed_id, {"mail":new_value}) - except: - return self.error("<p>Bad URL. This feed do not exists.</p>") - return self.index() - - mail_notification.exposed = True - - @auth.require() - def like(self, param): - """ - Mark or unmark an article as favorites. - """ - try: - like, feed_id, article_id = param.split(':') - articles = self.mongo.get_articles(feed_id, article_id) - except: - return self.error("<p>Bad URL. This article do not exists.</p>") - self.mongo.like_article("1"==like, feed_id, article_id) - return self.article(feed_id+":"+article_id) - - like.exposed = True - - @auth.require() - def subscriptions(self): - """ - List all active e-mail notifications. 
- """ - feeds = self.mongo.get_all_feeds() - tmpl = lookup.get_template("subscriptions.html") - return tmpl.render(feeds=feeds) - - subscriptions.exposed = True - - @auth.require() - def favorites(self): - """ - List of favorites articles - """ - feeds = self.mongo.get_all_feeds() - articles = {} - for feed in feeds: - articles[feed["feed_id"]] = self.mongo.get_favorites(feed["feed_id"]) - tmpl = lookup.get_template("favorites.html") - return tmpl.render(feeds=feeds, \ - articles=articles) - - favorites.exposed = True - - @auth.require() - def inactives(self, nb_days=365): - """ - List of favorites articles - """ - feeds = self.mongo.get_all_feeds() - today = datetime.datetime.now() - inactives = [] - for feed in feeds: - more_recent_article = self.mongo.get_articles(feed["feed_id"], limit=1) - if more_recent_article.count() == 0: - last_post = datetime.datetime.fromtimestamp(time.mktime(time.gmtime(0))) - else: - last_post = next(more_recent_article)["article_date"] - elapsed = today - last_post - if elapsed > datetime.timedelta(days=int(nb_days)): - inactives.append((feed, elapsed)) - tmpl = lookup.get_template("inactives.html") - return tmpl.render(inactives=inactives, nb_days=int(nb_days)) - - inactives.exposed = True - - @auth.require() - def languages(self): - """ - Filter by languages. - """ - try: - from guess_language import guess_language_name - except: - tmpl = lookup.get_template("error.html") - return tmpl.render(message='<p>Module <i><a href="https://bitbucket.org/spirit/guess_language/">guess_language</a></i> not installed.</p>') - result = {} - feeds = self.mongo.get_all_feeds() - for feed in feeds: - for article in self.mongo.get_articles(feed["feed_id"]): - language = guess_language_name(utils.clear_string(article["article_content"])) - result.setdefault(language, defaultdict(list)) - result[language][feed["feed_id"]].append(article) - tmpl = lookup.get_template("languages.html") - return tmpl.render(articles_sorted_by_languages=result, mongo=self.mongo) - - languages.exposed = True - - @auth.require() - def add_feed(self, url): - """ - Add a new feed with the URL of a page. - """ - # search the feed in the HTML page with BeautifulSoup - feed_url = utils.search_feed(url) - if feed_url is None: - return self.error("<p>Impossible to find a feed at this URL.</p>") - # if a feed exists - else: - result = utils.add_feed(feed_url) - # if the feed is not in the file feed.lst - import hashlib - sha1_hash = hashlib.sha1() - sha1_hash.update(feed_url.encode('utf-8')) - feed_id = sha1_hash.hexdigest() - if result is False: - message = """<p>You are already following <a href="/feed/%s">this feed</a>!</p>""" % (feed_id,) - else: - message = """<p><a href="/feed/%s">Feed added</a>. You can now <a href="/fetch/">fetch your feeds</a>.</p>""" % (feed_id,) - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message=message) - - add_feed.exposed = True - - @auth.require() - def remove_feed(self, feed_id): - """ - Remove a feed from the file feed.lst and from the MongoDB database. 
- """ - feed = self.mongo.get_feed(feed_id) - self.mongo.delete_feed(feed_id) - utils.remove_feed(feed["feed_link"]) - message = """<p>All articles from the feed <i>%s</i> are now removed from the base.</p>""" % (feed["feed_title"],) - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message=message) - - remove_feed.exposed = True - - @auth.require() - def change_site_url(self, feed_id, old_site_url, new_site_url): - """ - Enables to change the URL of a site present in the database. - """ - try: - self.mongo.update_feed(feed_id, {"site_link":new_site_url}) - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message="<p>The URL of the site has been changed.</p>") - except: - return self.error("<p>Error when changing the URL of the site.</p>") - - change_site_url.exposed = True - - @auth.require() - def change_feed_url(self, feed_id, old_feed_url, new_feed_url): - """ - Enables to change the URL of a feed already present in the database. - """ - import hashlib - sha1_hash = hashlib.sha1() - sha1_hash.update(new_feed_url.encode('utf-8')) - new_feed_id = sha1_hash.hexdigest() - self.mongo.update_feed(feed_id, {"feed_id":new_feed_id, - "feed_link":new_feed_url}) - result = utils.change_feed_url(old_feed_url, new_feed_url) - if result: - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message="<p>The URL of the feed has been changed.</p>") - else: - return self.error("<p>Error when changing the URL of the feed.</p>") - - change_feed_url.exposed = True - - @auth.require() - def change_feed_name(self, feed_id, new_feed_name): - """ - Enables to change the name of a feed. - """ - try: - self.mongo.update_feed(feed_id, {"feed_title":new_feed_name}) - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message="<p>The name of the feed has been changed.</p>") - except: - return self.error("<p>Error when changing the name of the feed.</p>") - - change_feed_name.exposed = True - - @auth.require() - def change_feed_logo(self, feed_id, new_feed_logo): - """ - Enables to change the name of a feed. - """ - try: - self.mongo.update_feed(feed_id, {"feed_image":new_feed_logo}) - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message="<p>The logo of the feed has been changed.</p>") - except: - return self.error("<p>Error when changing the logo of the feed.</p>") - - change_feed_logo.exposed = True - - @auth.require() - def change_username(self, new_username): - """ - Enables to change the username of a user. - """ - result = auth.change_username(self.auth.username, new_username) - if result: - self.auth.username = new_username - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message="<p>Your username has been changed.</p>") - else: - return self.error("<p>Impossible to change the username.</p>") - - change_username.exposed = True - - @auth.require() - def change_password(self, new_password): - """ - Enables to change the password of a user. - """ - result = auth.change_password(self.auth.username, new_password) - if result: - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message="<p>Your password has been changed.</p>") - else: - return self.error("<p>Impossible to change the password.</p>") - - change_password.exposed = True - - @auth.require() - def delete_article(self, param): - """ - Delete an article. 
- """ - try: - feed_id, article_id = param.split(':') - # Delete from the MonfoDB database - self.mongo.delete_article(feed_id, article_id) - # Delete from the Whoosh index - search.delete_article(feed_id, article_id) - except: - return self.error("<p>Bad URL. This article do not exists.</p>") - - return self.index() - - delete_article.exposed = True - - @auth.require() - def logout(self): - """ - Close the session. - """ - return self.auth.logout() - - logout.exposed = True - - @auth.require() - def drop_base(self): - """ - Delete all articles. - """ - self.mongo.drop_database() - return self.index() - - drop_base.exposed = True - - @auth.require() - def index_base(self): - """ - Launches the indexing of the database. - """ - search.create_index() - return self.index() - - index_base.exposed = True - - @auth.require() - def export(self, export_method): - """ - Export articles currently loaded from the MongoDB database with - the appropriate function of the 'export' module. - """ - getattr(export, export_method)(self.mongo) - try: - getattr(export, export_method)(self.mongo) - except Exception as e: - return self.error(e) - tmpl = lookup.get_template("confirmation.html") - return tmpl.render(message="<p>Export successfully terminated.<br />Check the folder: <b>" + conf.path + "/var/export/</b>.</p>") - - export.exposed = True - - @auth.require() - def epub(self, param): - """ - Export an article to EPUB. - """ - try: - from epub import ez_epub - except Exception as e: - return self.error(e) - try: - feed_id, article_id = param.split(':') - except: - return self.error("Bad URL.") - try: - feed_id, article_id = param.split(':') - feed = self.mongo.get_feed(feed_id) - articles = self.mongo.get_articles(feed_id) - article = self.mongo.get_articles(feed_id, article_id) - except: - self.error("<p>This article do not exists.</p>") - try: - folder = conf.path + "/var/export/epub/" - os.makedirs(folder) - except OSError: - # directories already exists (not a problem) - pass - section = ez_epub.Section() - section.title = article["article_title"] - section.paragraphs = [utils.clear_string(article["article_content"])] - ez_epub.makeBook(article["article_title"], [feed["feed_title"]], [section], \ - os.path.normpath(folder) + "article.epub", lang='en-US', cover=None) - return self.article(param) - - epub.exposed = True - - -if __name__ == '__main__': - # Point of entry in execution mode - root = pyAggr3g470r() - root.favicon_ico = cherrypy.tools.staticfile.handler(filename=os.path.join(conf.path + "/static/img/favicon.png")) - cherrypy.config.update({'error_page.404': error_404}) - cherrypy.quickstart(root, "/" ,config=conf.path + "/cfg/cherrypy.cfg") diff --git a/source/search.py b/source/search.py deleted file mode 100644 index a9248a09..00000000 --- a/source/search.py +++ /dev/null @@ -1,129 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.2 $" -__date__ = "$Date: 2013/06/24 $" -__revision__ = "$Date: 2013/06/25 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -import os - -from whoosh.index import create_in, open_dir -from whoosh.index import EmptyIndexError -from whoosh.fields import * -from whoosh.query import * -from whoosh.qparser import QueryParser -from whoosh.writing import AsyncWriter - -import conf -import utils -import mongodb - -indexdir = "./var/indexdir" - -schema = Schema(title=TEXT(stored=True), \ - content=TEXT, \ - article_id=TEXT(stored=True), \ - feed_id=TEXT(stored=True)) - -def create_index(): - """ - Creates the index. - """ - mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ - conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) - feeds = mongo.get_all_feeds() - if not os.path.exists(indexdir): - os.mkdir(indexdir) - ix = create_in(indexdir, schema) - writer = ix.writer() - for feed in feeds: - for article in mongo.get_articles(feed["feed_id"]): - writer.add_document(title=article["article_title"], \ - content=utils.clear_string(article["article_content"]), \ - article_id=article["article_id"] , \ - feed_id=feed["feed_id"]) - writer.commit() - -def add_to_index(articles, feed): - """ - Add a list of articles to the index. - Here an AsyncWriter is used because the function will - be called in multiple threads by the feedgetter module. - """ - try: - ix = open_dir(indexdir) - except (EmptyIndexError, OSError) as e: - raise EmptyIndexError - writer = AsyncWriter(ix) - for article in articles: - writer.add_document(title=article["article_title"], \ - content=utils.clear_string(article["article_content"]), \ - article_id=article["article_id"] , \ - feed_id=feed["feed_id"]) - writer.commit() - -def delete_article(feed_id, article_id): - """ - Delete an article from the index. - """ - try: - ix = open_dir(indexdir) - except (EmptyIndexError, OSError) as e: - raise EmptyIndexError - writer = ix.writer() - document = And([Term("feed_id", feed_id), Term("article_id", article_id)]) - writer.delete_by_query(document) - writer.commit() - -def search(term): - """ - Search for `term` in the index. - Returns a list of articles. - """ - try: - ix = open_dir(indexdir) - except (EmptyIndexError, OSError) as e: - raise EmptyIndexError - with ix.searcher() as searcher: - query = QueryParser("content", ix.schema).parse(term) - results = searcher.search(query, limit=None) - return [(article["feed_id"], article["article_id"]) for article in results] - -def nb_documents(): - """ - Return the number of undeleted documents. - """ - try: - ix = open_dir(indexdir) - except (EmptyIndexError, OSError) as e: - raise EmptyIndexError - return ix.doc_count() - -if __name__ == "__main__": - # Point of entry in execution mode. - #create_index() - print(nb_documents()) - results = search("Nothomb") - for article in results: - print(article) diff --git a/source/testbinarytree.py b/source/testbinarytree.py deleted file mode 100644 index 84670ca1..00000000 --- a/source/testbinarytree.py +++ /dev/null @@ -1,45 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- - -import time -import sys -import resource -# Increases Python's recursion limit and the size of the stack. 
-resource.setrlimit(resource.RLIMIT_STACK, (2**29,-1))
-sys.setrecursionlimit(10**6)
-
-import mongodb
-import binarytree
-import conf
-
-print("Loading articles from the database...")
-database = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \
- conf.MONGODB_DBNAME, conf.MONGODB_USER, \
- conf.MONGODB_PASSWORD)
-begin = time.time()
-articles = database.get_articles()
-end = time.time()
-print(("{} articles loaded in {} seconds.".format(len(articles), end-begin)))
-
-print("Generating the binary tree...")
-begin = time.time()
-root = binarytree.Node(articles[0])
-tree = binarytree.OrderedBinaryTree(root)
-# add the root node (first article of the list)
-#root = tree.addNode(articles[0])
-for article in articles[1:]:
- tree.insert(tree.root, article)
-end = time.time()
-print(("Generation done in {0:.2f} seconds.".format(end-begin)))
-
-print("Maximum depth of the tree:")
-print(tree.maxDepth(tree.root))
-print("Oldest article:")
-oldest_article = tree.minValue(tree.root)
-print((oldest_article["article_date"].strftime('%Y-%m-%d %H:%M') + \
- " - " + oldest_article["article_title"]))
-print("Newest article:")
-newest_article = tree.maxValue(tree.root)
-print((newest_article["article_date"].strftime('%Y-%m-%d %H:%M') + \
- " - " + newest_article["article_title"]))
-#print(tree)
\ No newline at end of file diff --git a/source/utils.py b/source/utils.py deleted file mode 100755 index d39e402f..00000000 --- a/source/utils.py +++ /dev/null @@ -1,317 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/> - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 1.5 $" -__date__ = "$Date: 2010/12/07 $" -__revision__ = "$Date: 2013/07/24 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "GPLv3" - -# -# This file provides functions used for: -# - the database management; -# - generation of tags cloud; -# - HTML processing; -# - e-mail notifications. -# - -import os -import re -import glob -import operator -import calendar -import html.entities - -try: - from qrcode.pyqrnative.PyQRNative import QRCode, QRErrorCorrectLevel, CodeOverflowException - from qrcode import qr -except: - pass - -import smtplib -from email.mime.multipart import MIMEMultipart -from email.mime.text import MIMEText - -import urllib.request, urllib.error, urllib.parse -import http.server -from bs4 import BeautifulSoup - -from collections import Counter -from contextlib import contextmanager - -import conf - -# regular expression to check URL -url_finders = [ \ - re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \ - re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \ - re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \ - re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+") \ -] - -import log -pyaggr3g470r_log = log.Log() - -@contextmanager -def opened_w_error(filename, mode="r"): - try: - f = open(filename, mode) - except IOError as err: - yield None, err - else: - try: - yield f, None - finally: - f.close() - -def open_url(url): - """ - Open an URL with the proxy and the user-agent - specified in the configuration file. 
- """ - if conf.HTTP_PROXY == "": - proxy = {} - else: - proxy = {"http" : conf.HTTP_PROXY} - opener = urllib.request.FancyURLopener(proxy) - try: - opener = urllib.request.build_opener() - opener.addheaders = [('User-agent', conf.USER_AGENT)] - return (True, opener.open(url)) - except urllib.error.HTTPError as e: - # server couldn't fulfill the request - error = (url, e.code, \ - http.server.BaseHTTPRequestHandler.responses[e.code][1]) - pyaggr3g470r_log.error(url + " " + str(e.code) + " " + \ - http.server.BaseHTTPRequestHandler.responses[e.code][1]) - return (False, error) - except urllib.error.URLError as e: - # failed to reach the server - if type(e.reason) == str: - error = (url, e.reason, e.reason) - pyaggr3g470r_log.error(url + " " + e.reason) - else: - error = (url, e.reason.errno, e.reason.strerror) - pyaggr3g470r_log.error(url + " " + str(e.reason.errno) + " " + \ - e.reason.strerror) - return (False, error) - -def generate_qr_code(article): - """ - Generated a QR Code for the article given in parameter. - """ - try: - os.makedirs("./var/qrcode/") - except OSError: - pass - if not os.path.isfile("./var/qrcode/" + article["article_id"] + ".png"): - # QR Code generation - try: - f = qr.QRUrl(url = article["article_link"]) - f.make() - f.save("./var/qrcode/" + article["article_id"] + ".png") - except: - pass - -def clear_string(data): - """ - Clear a string by removing HTML tags, HTML special caracters - and consecutive white spaces (more that one). - """ - p = re.compile(b'<[^>]+>') # HTML tags - q = re.compile(b'\s') # consecutive white spaces - return p.sub(b'', q.sub(b' ', bytes(data, "utf-8"))).decode("utf-8", "strict") - -def normalize_filename(name): - """ - Normalize a file name. - """ - file_name = re.sub("[,'!?|&]", "", name) - file_name = re.sub("[\s.]", "_", file_name) - file_name = file_name.strip('_') - file_name = file_name.strip('.') - return os.path.normpath(file_name) - -def load_stop_words(): - """ - Load the stop words and return them in a list. - """ - stop_words_lists = glob.glob('./var/stop_words/*.txt') - stop_words = [] - - for stop_wods_list in stop_words_lists: - with opened_w_error(stop_wods_list, "r") as (stop_wods_file, err): - if err: - stop_words = [] - else: - stop_words += stop_wods_file.read().split(";") - return stop_words - -def top_words(articles, n=10, size=5): - """ - Return the n most frequent words in a list. - """ - stop_words = load_stop_words() - words = Counter() - wordre = re.compile(r'\b\w{%s,}\b' % size, re.I) - for article in articles: - for word in [elem.lower() for elem in - wordre.findall(clear_string(article["article_content"])) \ - if elem.lower() not in stop_words]: - words[word] += 1 - return words.most_common(n) - -def tag_cloud(tags, query="word_count"): - """ - Generates a tags cloud. 
- """ - tags.sort(key=operator.itemgetter(0)) - if query == "word_count": - # tags cloud from the management page - return ' '.join([('<font size=%d><a href="/search/?query=%s" title="Count: %s">%s</a></font>\n' % \ - (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \ - for (word, count) in tags]) - if query == "year": - # tags cloud for the history - return ' '.join([('<font size=%d><a href="/history/?query=%s:%s" title="Count: %s">%s</a></font>\n' % \ - (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), query, word, format(count, ',d'), word)) \ - for (word, count) in tags]) - return ' '.join([('<font size=%d><a href="/history/?query=%s:%s" title="Count: %s">%s</a></font>\n' % \ - (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), query, word, format(count, ',d'), calendar.month_name[int(word)])) \ - for (word, count) in tags]) - -def send_mail(mfrom, mto, feed_title, article_title, description): - """ - Send the article via mail. - """ - # Create the body of the message (a plain-text and an HTML version). - html = """<html>\n<head>\n<title>%s</title>\n</head>\n<body>\n%s\n</body>\n</html>""" % \ - (feed_title + ": " + article_title, description) - text = clear_string(description) - - # Create message container - the correct MIME type is multipart/alternative. - msg = MIMEMultipart('alternative') - msg['Subject'] = '[pyAggr3g470r] ' + feed_title + ": " + article_title - msg['From'] = mfrom - msg['To'] = mto - - # Record the MIME types of both parts - text/plain and text/html. - part1 = MIMEText(text, 'plain', 'utf-8') - part2 = MIMEText(html, 'html', 'utf-8') - - # Attach parts into message container. - # According to RFC 2046, the last part of a multipart message, in this case - # the HTML message, is best and preferred. - msg.attach(part1) - msg.attach(part2) - - # Send the message via local SMTP server. - try: - s = smtplib.SMTP(conf.smtp_server) - s.login(conf.username, conf.password) - except Exception as e: - print(e) - else: - s.send_message(msg) - s.quit() - -def add_feed(feed_url): - """ - Add the URL feed_url in the file feed.lst. - """ - with opened_w_error(conf.FEED_LIST, "r") as (f, err): - if err: - return False - else: - lines = f.readlines() - lines = list(map(str.strip, lines)) - if feed_url in lines: - return False - lines.append(feed_url) - with open(conf.FEED_LIST, "w") as f: - f.write("\n".join(lines)) - return True - -def change_feed_url(old_feed_url, new_feed_url): - """ - Change the URL of a feed given in parameter. - """ - # Replace the URL in the text file - with opened_w_error(conf.FEED_LIST, "r") as (f, err): - if err: - return False - else: - lines = f.readlines() - lines = list(map(str.strip, lines)) - try: - lines[lines.index(old_feed_url)] = new_feed_url - except: - return False - with opened_w_error(conf.FEED_LIST, "w") as (f, err): - if err: - return False - else: - f.write("\n".join(lines)) - return True - -def remove_feed(feed_url): - """ - Remove a feed from the file feed.lst and from the database. - """ - with opened_w_error(conf.FEED_LIST, "r") as (f, err): - if err: - return False - else: - lines = f.readlines() - lines = list(map(str.strip, lines)) - try: - del lines[lines.index(feed_url)] - except: - return False - with opened_w_error(conf.FEED_LIST, "w") as (f, err): - if err: - return False - else: - f.write("\n".join(lines)) - return True - -def search_feed(url): - """ - Search a feed in a HTML page. 
- """ - soup, page = None, None - try: - result = open_url(url) - if result[0] == True: - page = open_url(url)[1] - else: - return None - soup = BeautifulSoup(page) - except: - return None - feed_links = soup('link', type='application/atom+xml') - feed_links.extend(soup('link', type='application/rss+xml')) - for feed_link in feed_links: - if url not in feed_link['href']: - return urllib.parse.urljoin(url, feed_link['href']) - return feed_link['href'] - return None |