Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- | pyaggr3g470r/__init__.py   |  3 |
-rw-r--r-- | pyaggr3g470r/crawler.py    | 42 |
-rw-r--r-- | pyaggr3g470r/decorators.py |  7 |
-rw-r--r-- | pyaggr3g470r/export.py     |  3 |
-rw-r--r-- | pyaggr3g470r/search.py     | 41 |
-rwxr-xr-x | pyaggr3g470r/utils.py      | 24 |
-rw-r--r-- | pyaggr3g470r/views.py      |  9 |
7 files changed, 79 insertions, 50 deletions
diff --git a/pyaggr3g470r/__init__.py b/pyaggr3g470r/__init__.py
index e948a187..a8211217 100644
--- a/pyaggr3g470r/__init__.py
+++ b/pyaggr3g470r/__init__.py
@@ -19,6 +19,7 @@ db = SQLAlchemy(app)
 
 ALLOWED_EXTENSIONS = set(['xml', 'opml'])
 
+
 def allowed_file(filename):
     """
     Check if the uploaded WSW file is allowed.
@@ -34,7 +35,7 @@ if not conf.ON_HEROKU:
     app.config["MAIL_USERNAME"] = conf.MAIL_USERNAME
     app.config["MAIL_PASSWORD"] = conf.MAIL_PASSWORD
 
-    from flask.ext.mail import Message, Mail
+    from flask.ext.mail import Mail
     mail = Mail(app)
 
 # Gravatar
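Note: the hunk above only shows the top of allowed_file(). For readers following along, a minimal sketch of the whitelist check it performs; the function body is not part of this diff, so the implementation below is an assumption based on the docstring and ALLOWED_EXTENSIONS:

    # Hypothetical sketch -- the real body of allowed_file() is not shown
    # in this diff; this is the usual Flask upload-whitelist pattern.
    ALLOWED_EXTENSIONS = set(['xml', 'opml'])

    def allowed_file(filename):
        """Check if the uploaded file's extension is whitelisted."""
        return '.' in filename and \
            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS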
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index dade3bea..ea149f5e 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -31,7 +31,6 @@ import urllib2
 import requests
 from requests.exceptions import *
 #from requests.packages.urllib3.exceptions import DecodeError
-from urlparse import urlparse
 from datetime import datetime
 
 import gevent.monkey
@@ -51,7 +50,6 @@ requests_log.propagate = True
 """
 
-import models
 import conf
 if not conf.ON_HEROKU:
     import search as fastsearch
@@ -60,9 +58,9 @@ import utils
 if not conf.ON_HEROKU:
     from flask.ext.mail import Message
     from pyaggr3g470r import mail
-
-from pyaggr3g470r import app, db
-from pyaggr3g470r.models import User, Feed, Article
+
+from pyaggr3g470r import db
+from pyaggr3g470r.models import User, Article
 
 import log
 pyaggr3g470r_log = log.Log("feedgetter")
@@ -75,6 +73,7 @@ class TooLong(Exception):
     """
     pyaggr3g470r_log.warning("Greenlet took to long")
 
+
 class FeedGetter(object):
     """
     This class is in charge of retrieving the feeds.
@@ -88,7 +87,7 @@ class FeedGetter(object):
             self.proxy = urllib2.ProxyHandler({})
             self.proxies = {}
         else:
-            self.proxy = urllib2.ProxyHandler({"http" : conf.HTTP_PROXY, \
+            self.proxy = urllib2.ProxyHandler({"http": conf.HTTP_PROXY,
                                                "https": conf.HTTP_PROXY})
             self.proxies = {
                 "http": "http://" + conf.HTTP_PROXY,
@@ -110,7 +109,8 @@ class FeedGetter(object):
             feeds = [feed for feed in feeds if feed.id == feed_id]
 
         # 2 - Fetch the feeds.
-        # 'responses' contains all the jobs returned by the function retrieve_async()
+        # 'responses' contains all the jobs returned by
+        # the function retrieve_async()
         responses = self.retrieve_async(feeds)
         elements = [item.value for item in responses if item.value is not None]
@@ -133,7 +133,7 @@ class FeedGetter(object):
         Fetch a feed.
         """
         pyaggr3g470r_log.info("Fetching the feed:" + feed.title)
-        a_feed = feedparser.parse(feed.link, handlers = [self.proxy])
+        a_feed = feedparser.parse(feed.link, handlers=[self.proxy])
         if a_feed['entries'] == []:
             return
@@ -155,14 +155,20 @@ class FeedGetter(object):
             nice_url = article.link.encode("utf-8")
             if conf.RESOLVE_ARTICLE_URL:
                 try:
-                    # resolves URL behind proxies (like feedproxy.google.com)
-                    r = requests.get(article.link, timeout=5.0, proxies=self.proxies)
+                    # resolves URL behind proxies
+                    # (like feedproxy.google.com)
+                    r = requests.get(article.link, timeout=5.0,
+                                     proxies=self.proxies)
                     nice_url = r.url.encode("utf-8")
                 except Timeout:
-                    pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
+                    pyaggr3g470r_log.\
+                        warning("Timeout when getting the real URL of %s." %
+                                (article.link,))
                     continue
                 except Exception as e:
-                    pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
+                    pyaggr3g470r_log.\
+                        warning("Unable to get the real URL of %s. Error: %s" %
+                                (article.link, str(e)))
                     continue
             # remove utm_* parameters
             nice_url = utils.clean_url(nice_url)
@@ -181,7 +187,7 @@ class FeedGetter(object):
             try:
                 description = BeautifulSoup(description, "html.parser").decode()
                 article_title = BeautifulSoup(article.title, "html.parser").decode()
-            except Exception as E:
+            except Exception:
                 pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
                 article_title = article.title
@@ -215,7 +221,9 @@ class FeedGetter(object):
         for article in articles:
 
-            exist = Article.query.filter(Article.user_id == self.user.id, Article.feed_id == feed.id, Article.link == article.link).first()
+            exist = Article.query.filter(Article.user_id == self.user.id,
+                                         Article.feed_id == feed.id,
+                                         Article.link == article.link).first()
             if exist != None:
                 pyaggr3g470r_log.error("Article %s (%s) already in the database." % (article.title, article.link))
                 continue
@@ -242,9 +250,11 @@ class FeedGetter(object):
         pyaggr3g470r_log.info("Indexing new articles.")
         for feed, articles in elements:
             for element in articles:
-                article = Article.query.filter(Article.user_id == self.user.id, Article.link == element.link).first()
+                article = Article.query.filter(Article.user_id == self.user.id,
+                                               Article.link == element.link).first()
                 try:
-                    fastsearch.add_to_index(self.user.id, [article], article.source)
+                    fastsearch.add_to_index(self.user.id, [article],
+                                            article.source)
                 except:
                     pyaggr3g470r_log.error("Problem during indexation.")
         return True
\ No newline at end of file
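Note: the core of the @@ -155 hunk is the redirect-resolution step. A self-contained sketch of that logic, assuming Python 2 and the requests library as imported above (resolve_url is a hypothetical name; in the crawler this code runs inline):

    import requests
    from requests.exceptions import Timeout

    def resolve_url(link, proxies, timeout=5.0):
        """Follow redirects (e.g. feedproxy.google.com) to the final URL.

        Returns None on failure so the caller can skip the article,
        mirroring the 'continue' branches in the crawler above.
        """
        try:
            r = requests.get(link, timeout=timeout, proxies=proxies)
            return r.url.encode("utf-8")
        except Timeout:
            return None
        except Exception:
            return None

The resolved URL is then passed through utils.clean_url() (see the utils.py hunks below) to strip utm_* tracking parameters before the duplicate check on Article.link.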
diff --git a/pyaggr3g470r/decorators.py b/pyaggr3g470r/decorators.py
index 565d25a6..a32e9709 100644
--- a/pyaggr3g470r/decorators.py
+++ b/pyaggr3g470r/decorators.py
@@ -8,16 +8,19 @@ from flask import g, redirect, url_for, flash
 
 from pyaggr3g470r.models import Feed
 
+
 def async(f):
     def wrapper(*args, **kwargs):
-        thr = Thread(target = f, args = args, kwargs = kwargs)
+        thr = Thread(target=f, args=args, kwargs=kwargs)
         thr.start()
     return wrapper
 
+
 def feed_access_required(func):
     """
     This decorator enables to check if a user has access to a feed.
-    The administrator of the platform is able to access to the feeds of a normal user.
+    The administrator of the platform is able to access to the feeds
+    of a normal user.
     """
     @wraps(func)
     def decorated(*args, **kwargs):
diff --git a/pyaggr3g470r/export.py b/pyaggr3g470r/export.py
index e7978e7c..243b6843 100644
--- a/pyaggr3g470r/export.py
+++ b/pyaggr3g470r/export.py
@@ -40,9 +40,9 @@ import tarfile
 from datetime import datetime
 
 import conf
-import utils
 import models
 
+
 def HTML_HEADER(title="pyAggr3g470r", css="./style.css"):
     return """<!DOCTYPE html>
 <html lang="en-US">
@@ -132,6 +132,7 @@ img {
     margin:1.00em 1.00em;
 }"""
 
+
 def export_html(user):
     """
     Export all articles of 'user' in Web pages.
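Note: the async decorator above fires the wrapped function in a background thread and returns immediately (no result, no join). Usage sketch, assuming Python 2, where async is not yet a reserved word (it became one in Python 3.7); notify is a made-up example function:

    from threading import Thread

    def async(f):
        def wrapper(*args, **kwargs):
            thr = Thread(target=f, args=args, kwargs=kwargs)
            thr.start()
        return wrapper

    @async
    def notify(address, subject):
        # long-running work (e.g. sending mail) happens off the
        # calling thread; any return value is discarded
        print("notifying %s about %s" % (address, subject))

    notify("admin@example.net", "new articles")  # returns immediately

The wrapper returns None, so callers cannot observe failures; that trade-off is inherent to this fire-and-forget pattern.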
""" try: ix = open_dir(indexdir) - except (EmptyIndexError, OSError) as e: + except (EmptyIndexError, OSError): raise EmptyIndexError writer = ix.writer() - document = And([Term("user_id", user_id), Term("feed_id", feed_id), Term("article_id", article_id)]) + document = And([Term("user_id", user_id), Term("feed_id", feed_id), + Term("article_id", article_id)]) writer.delete_by_query(document) writer.commit() + def search(user_id, term): """ Search for `term` in the index. @@ -106,7 +110,7 @@ def search(user_id, term): result_dict = defaultdict(list) try: ix = open_dir(indexdir) - except (EmptyIndexError, OSError) as e: + except (EmptyIndexError, OSError): raise EmptyIndexError with ix.searcher() as searcher: query = QueryParser("content", ix.schema).parse(term) @@ -115,13 +119,14 @@ def search(user_id, term): result_dict[article["feed_id"]].append(article["article_id"]) return result_dict, len(results) + def nb_documents(): """ Return the number of undeleted documents. """ try: ix = open_dir(indexdir) - except (EmptyIndexError, OSError) as e: + except (EmptyIndexError, OSError): raise EmptyIndexError return ix.doc_count() diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py index ce210c20..88a3904a 100755 --- a/pyaggr3g470r/utils.py +++ b/pyaggr3g470r/utils.py @@ -34,12 +34,10 @@ __license__ = "AGPLv3" # - e-mail notifications. # -import os import re import glob import opml import operator -import calendar from urllib import urlencode from urlparse import urlparse, parse_qs, urlunparse from BeautifulSoup import BeautifulSoup @@ -52,7 +50,7 @@ from pyaggr3g470r import db from pyaggr3g470r.models import User, Feed # regular expression to check URL -url_finders = [ \ +url_finders = [ re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \ re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \ re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \ @@ -62,6 +60,7 @@ url_finders = [ \ #import log #pyaggr3g470r_log = log.Log() + @contextmanager def opened_w_error(filename, mode="r"): try: @@ -74,6 +73,7 @@ def opened_w_error(filename, mode="r"): finally: f.close() + def import_opml(email, opml_file): """ Import new feeds from an OPML file. @@ -89,11 +89,11 @@ def import_opml(email, opml_file): Parse recursively through the categories and sub-categories. 
""" for subscription in subsubscription: - + if len(subscription) != 0: nb = read(subscription, nb) else: - + try: title = subscription.text @@ -118,7 +118,9 @@ def import_opml(email, opml_file): except: site_link = "" - new_feed = Feed(title=title, description=description, link=link, site_link=site_link, email_notification=False, enabled=True) + new_feed = Feed(title=title, description=description, + link=link, site_link=site_link, + email_notification=False, enabled=True) user.feeds.append(new_feed) nb += 1 @@ -128,13 +130,15 @@ def import_opml(email, opml_file): db.session.commit() return nb + def clean_url(url): """ Remove utm_* parameters """ parsed_url = urlparse(url) qd = parse_qs(parsed_url.query, keep_blank_values=True) - filtered = dict((k, v) for k, v in qd.iteritems() if not k.startswith('utm_')) + filtered = dict((k, v) for k, v in qd.iteritems() + if not k.startswith('utm_')) nice_url = urlunparse([ parsed_url.scheme, parsed_url.netloc, @@ -145,6 +149,7 @@ def clean_url(url): ]) return nice_url + def open_url(url): """ Open an URL with the proxy and the user-agent @@ -175,6 +180,7 @@ def open_url(url): #pyaggr3g470r_log.error(url + " " + str(e.reason.errno) + " " + e.reason.strerror) return (False, error) + def clear_string(data): """ Clear a string by removing HTML tags, HTML special caracters @@ -184,6 +190,7 @@ def clear_string(data): q = re.compile('\s') # consecutive white spaces return p.sub('', q.sub(' ', data)) + def load_stop_words(): """ Load the stop words and return them in a list. @@ -199,6 +206,7 @@ def load_stop_words(): stop_words += stop_wods_file.read().split(";") return stop_words + def top_words(articles, n=10, size=5): """ Return the n most frequent words in a list. @@ -213,6 +221,7 @@ def top_words(articles, n=10, size=5): words[word] += 1 return words.most_common(n) + def tag_cloud(tags): """ Generates a tags cloud. @@ -222,6 +231,7 @@ def tag_cloud(tags): (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \ for (word, count) in tags]) + def search_feed(url): """ Search a feed in a HTML page. 
diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py
index 514d1920..02ef4f9e 100644
--- a/pyaggr3g470r/views.py
+++ b/pyaggr3g470r/views.py
@@ -38,7 +38,6 @@ from werkzeug import generate_password_hash
 import conf
 import utils
 import export
-import models
 if not conf.ON_HEROKU:
     import search as fastsearch
 from forms import SigninForm, AddFeedForm, ProfileForm
@@ -103,7 +102,7 @@ def page_not_found(e):
     return render_template('errors/404.html'), 404
 
 @app.errorhandler(500)
-def page_not_found(e):
+def internal_server_error(e):
     return render_template('errors/500.html'), 500
 
@@ -166,7 +165,7 @@ def home():
             new_feed.id = feed.id
             new_feed.title = feed.title
             new_feed.enabled = feed.enabled
-            new_feed.articles = Article.query.filter(Article.user_id == g.user.id, 
+            new_feed.articles = Article.query.filter(Article.user_id == g.user.id,
                                                      Article.feed_id == feed.id).order_by(desc("Article.date")).limit(9)
             result.append(new_feed)
         unread_articles = len(Article.query.filter(Article.user_id == g.user.id, Article.readed == False).all())
@@ -247,7 +246,7 @@ def article(article_id=None):
         return render_template('article.html', head_title=utils.clear_string(article.title), article=article)
     flash("This article do not exist.", 'warning')
     return redirect(redirect_url())
-    
+
 @app.route('/mark_as_read/', methods=['GET'])
 @app.route('/mark_as_read/<int:feed_id>', methods=['GET'])
@@ -473,7 +472,7 @@ def management():
         data = request.files.get('opmlfile', None)
         if None == data or not allowed_file(data.filename):
             flash('File not allowed.', 'danger')
-        else: 
+        else:
            opml_path = os.path.join("./pyaggr3g470r/var/", data.filename)
            data.save(opml_path)
            try:
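Note on the @app.errorhandler(500) rename: Flask registers the handler at decoration time, so the duplicated name did not break error handling, but the second def silently rebound the module-level name page_not_found, shadowing the 404 handler object. Distinct names keep both functions addressable (for tests, url_for, introspection). Minimal sketch using the names from the hunk above:

    from flask import Flask, render_template

    app = Flask(__name__)

    @app.errorhandler(404)
    def page_not_found(e):
        return render_template('errors/404.html'), 404

    @app.errorhandler(500)
    def internal_server_error(e):
        # before the rename, this def was also called page_not_found,
        # rebinding the name above even though both handlers still ran
        return render_template('errors/500.html'), 500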