Diffstat (limited to 'pyaggr3g470r')
-rw-r--r--   pyaggr3g470r/__init__.py     3
-rw-r--r--   pyaggr3g470r/crawler.py     42
-rw-r--r--   pyaggr3g470r/decorators.py   7
-rw-r--r--   pyaggr3g470r/export.py       3
-rw-r--r--   pyaggr3g470r/search.py      41
-rwxr-xr-x   pyaggr3g470r/utils.py       24
-rw-r--r--   pyaggr3g470r/views.py        9
7 files changed, 79 insertions, 50 deletions
diff --git a/pyaggr3g470r/__init__.py b/pyaggr3g470r/__init__.py
index e948a187..a8211217 100644
--- a/pyaggr3g470r/__init__.py
+++ b/pyaggr3g470r/__init__.py
@@ -19,6 +19,7 @@ db = SQLAlchemy(app)
ALLOWED_EXTENSIONS = set(['xml', 'opml'])
+
def allowed_file(filename):
"""
Check if the uploaded WSW file is allowed.
@@ -34,7 +35,7 @@ if not conf.ON_HEROKU:
app.config["MAIL_USERNAME"] = conf.MAIL_USERNAME
app.config["MAIL_PASSWORD"] = conf.MAIL_PASSWORD
- from flask.ext.mail import Message, Mail
+ from flask.ext.mail import Mail
mail = Mail(app)
# Gravatar
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index dade3bea..ea149f5e 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -31,7 +31,6 @@ import urllib2
import requests
from requests.exceptions import *
#from requests.packages.urllib3.exceptions import DecodeError
-from urlparse import urlparse
from datetime import datetime
import gevent.monkey
@@ -51,7 +50,6 @@ requests_log.propagate = True
"""
-import models
import conf
if not conf.ON_HEROKU:
import search as fastsearch
@@ -60,9 +58,9 @@ import utils
if not conf.ON_HEROKU:
from flask.ext.mail import Message
from pyaggr3g470r import mail
-
-from pyaggr3g470r import app, db
-from pyaggr3g470r.models import User, Feed, Article
+
+from pyaggr3g470r import db
+from pyaggr3g470r.models import User, Article
import log
pyaggr3g470r_log = log.Log("feedgetter")
@@ -75,6 +73,7 @@ class TooLong(Exception):
"""
pyaggr3g470r_log.warning("Greenlet took to long")
+
class FeedGetter(object):
"""
This class is in charge of retrieving the feeds.
@@ -88,7 +87,7 @@ class FeedGetter(object):
self.proxy = urllib2.ProxyHandler({})
self.proxies = {}
else:
- self.proxy = urllib2.ProxyHandler({"http" : conf.HTTP_PROXY, \
+ self.proxy = urllib2.ProxyHandler({"http": conf.HTTP_PROXY,
"https": conf.HTTP_PROXY})
self.proxies = {
"http": "http://" + conf.HTTP_PROXY,
@@ -110,7 +109,8 @@ class FeedGetter(object):
feeds = [feed for feed in feeds if feed.id == feed_id]
# 2 - Fetch the feeds.
- # 'responses' contains all the jobs returned by the function retrieve_async()
+ # 'responses' contains all the jobs returned by
+ # the function retrieve_async()
responses = self.retrieve_async(feeds)
elements = [item.value for item in responses if item.value is not None]
@@ -133,7 +133,7 @@ class FeedGetter(object):
Fetch a feed.
"""
pyaggr3g470r_log.info("Fetching the feed:" + feed.title)
- a_feed = feedparser.parse(feed.link, handlers = [self.proxy])
+ a_feed = feedparser.parse(feed.link, handlers=[self.proxy])
if a_feed['entries'] == []:
return
@@ -155,14 +155,20 @@ class FeedGetter(object):
nice_url = article.link.encode("utf-8")
if conf.RESOLVE_ARTICLE_URL:
try:
- # resolves URL behind proxies (like feedproxy.google.com)
- r = requests.get(article.link, timeout=5.0, proxies=self.proxies)
+ # resolves URL behind proxies
+ # (like feedproxy.google.com)
+ r = requests.get(article.link, timeout=5.0,
+ proxies=self.proxies)
nice_url = r.url.encode("utf-8")
except Timeout:
- pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
+ pyaggr3g470r_log.\
+ warning("Timeout when getting the real URL of %s." %
+ (article.link,))
continue
except Exception as e:
- pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
+ pyaggr3g470r_log.\
+ warning("Unable to get the real URL of %s. Error: %s" %
+ (article.link, str(e)))
continue
# remove utm_* parameters
nice_url = utils.clean_url(nice_url)
@@ -181,7 +187,7 @@ class FeedGetter(object):
try:
description = BeautifulSoup(description, "html.parser").decode()
article_title = BeautifulSoup(article.title, "html.parser").decode()
- except Exception as E:
+ except Exception:
pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
article_title = article.title
@@ -215,7 +221,9 @@ class FeedGetter(object):
for article in articles:
- exist = Article.query.filter(Article.user_id == self.user.id, Article.feed_id == feed.id, Article.link == article.link).first()
+ exist = Article.query.filter(Article.user_id == self.user.id,
+ Article.feed_id == feed.id,
+ Article.link == article.link).first()
if exist != None:
pyaggr3g470r_log.error("Article %s (%s) already in the database." % (article.title, article.link))
continue
@@ -242,9 +250,11 @@ class FeedGetter(object):
pyaggr3g470r_log.info("Indexing new articles.")
for feed, articles in elements:
for element in articles:
- article = Article.query.filter(Article.user_id == self.user.id, Article.link == element.link).first()
+ article = Article.query.filter(Article.user_id == self.user.id,
+ Article.link == element.link).first()
try:
- fastsearch.add_to_index(self.user.id, [article], article.source)
+ fastsearch.add_to_index(self.user.id, [article],
+ article.source)
except:
pyaggr3g470r_log.error("Problem during indexation.")
        return True
\ No newline at end of file
diff --git a/pyaggr3g470r/decorators.py b/pyaggr3g470r/decorators.py
index 565d25a6..a32e9709 100644
--- a/pyaggr3g470r/decorators.py
+++ b/pyaggr3g470r/decorators.py
@@ -8,16 +8,19 @@ from flask import g, redirect, url_for, flash
from pyaggr3g470r.models import Feed
+
def async(f):
def wrapper(*args, **kwargs):
- thr = Thread(target = f, args = args, kwargs = kwargs)
+ thr = Thread(target=f, args=args, kwargs=kwargs)
thr.start()
return wrapper
+
def feed_access_required(func):
"""
This decorator enables to check if a user has access to a feed.
- The administrator of the platform is able to access to the feeds of a normal user.
+ The administrator of the platform is able to access to the feeds
+ of a normal user.
"""
@wraps(func)
def decorated(*args, **kwargs):
diff --git a/pyaggr3g470r/export.py b/pyaggr3g470r/export.py
index e7978e7c..243b6843 100644
--- a/pyaggr3g470r/export.py
+++ b/pyaggr3g470r/export.py
@@ -40,9 +40,9 @@ import tarfile
from datetime import datetime
import conf
-import utils
import models
+
def HTML_HEADER(title="pyAggr3g470r", css="./style.css"):
return """<!DOCTYPE html>
<html lang="en-US">
@@ -132,6 +132,7 @@ img {
margin:1.00em 1.00em;
}"""
+
def export_html(user):
"""
Export all articles of 'user' in Web pages.
diff --git a/pyaggr3g470r/search.py b/pyaggr3g470r/search.py
index 6f8168db..f4e57251 100644
--- a/pyaggr3g470r/search.py
+++ b/pyaggr3g470r/search.py
@@ -37,16 +37,16 @@ from whoosh.writing import AsyncWriter
from collections import defaultdict
import utils
-import models
indexdir = "./pyaggr3g470r/var/indexdir"
-schema = Schema(title=TEXT, \
- content=TEXT, \
- article_id=NUMERIC(int, stored=True), \
- feed_id=NUMERIC(int, stored=True), \
+schema = Schema(title=TEXT,
+ content=TEXT,
+ article_id=NUMERIC(int, stored=True),
+ feed_id=NUMERIC(int, stored=True),
user_id=NUMERIC(int, stored=True))
+
def create_index(user):
"""
Creates the index.
@@ -57,13 +57,14 @@ def create_index(user):
writer = ix.writer()
for feed in user.feeds:
for article in feed.articles:
- writer.add_document(title=article.title, \
- content=utils.clear_string(article.content), \
- article_id=article.id, \
- feed_id=feed.id, \
+ writer.add_document(title=article.title,
+ content=utils.clear_string(article.content),
+ article_id=article.id,
+ feed_id=feed.id,
user_id=user.id)
writer.commit()
+
def add_to_index(user_id, articles, feed):
"""
Add a list of articles to the index.
@@ -72,32 +73,35 @@ def add_to_index(user_id, articles, feed):
"""
try:
ix = open_dir(indexdir)
- except (EmptyIndexError, OSError) as e:
+ except (EmptyIndexError, OSError):
if not os.path.exists(indexdir):
os.makedirs(indexdir)
ix = create_in(indexdir, schema)
writer = AsyncWriter(ix)
for article in articles:
- writer.add_document(title=article.title, \
- content=utils.clear_string(article.content), \
- article_id=article.id, \
- feed_id=feed.id, \
+ writer.add_document(title=article.title,
+ content=utils.clear_string(article.content),
+ article_id=article.id,
+ feed_id=feed.id,
user_id=user_id)
writer.commit()
+
def delete_article(user_id, feed_id, article_id):
"""
Delete an article from the index.
"""
try:
ix = open_dir(indexdir)
- except (EmptyIndexError, OSError) as e:
+ except (EmptyIndexError, OSError):
raise EmptyIndexError
writer = ix.writer()
- document = And([Term("user_id", user_id), Term("feed_id", feed_id), Term("article_id", article_id)])
+ document = And([Term("user_id", user_id), Term("feed_id", feed_id),
+ Term("article_id", article_id)])
writer.delete_by_query(document)
writer.commit()
+
def search(user_id, term):
"""
Search for `term` in the index.
@@ -106,7 +110,7 @@ def search(user_id, term):
result_dict = defaultdict(list)
try:
ix = open_dir(indexdir)
- except (EmptyIndexError, OSError) as e:
+ except (EmptyIndexError, OSError):
raise EmptyIndexError
with ix.searcher() as searcher:
query = QueryParser("content", ix.schema).parse(term)
@@ -115,13 +119,14 @@ def search(user_id, term):
result_dict[article["feed_id"]].append(article["article_id"])
return result_dict, len(results)
+
def nb_documents():
"""
Return the number of undeleted documents.
"""
try:
ix = open_dir(indexdir)
- except (EmptyIndexError, OSError) as e:
+ except (EmptyIndexError, OSError):
raise EmptyIndexError
return ix.doc_count()
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py
index ce210c20..88a3904a 100755
--- a/pyaggr3g470r/utils.py
+++ b/pyaggr3g470r/utils.py
@@ -34,12 +34,10 @@ __license__ = "AGPLv3"
# - e-mail notifications.
#
-import os
import re
import glob
import opml
import operator
-import calendar
from urllib import urlencode
from urlparse import urlparse, parse_qs, urlunparse
from BeautifulSoup import BeautifulSoup
@@ -52,7 +50,7 @@ from pyaggr3g470r import db
from pyaggr3g470r.models import User, Feed
# regular expression to check URL
-url_finders = [ \
+url_finders = [
re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \
re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \
re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \
@@ -62,6 +60,7 @@ url_finders = [ \
#import log
#pyaggr3g470r_log = log.Log()
+
@contextmanager
def opened_w_error(filename, mode="r"):
try:
@@ -74,6 +73,7 @@ def opened_w_error(filename, mode="r"):
finally:
f.close()
+
def import_opml(email, opml_file):
"""
Import new feeds from an OPML file.
@@ -89,11 +89,11 @@ def import_opml(email, opml_file):
Parse recursively through the categories and sub-categories.
"""
for subscription in subsubscription:
-
+
if len(subscription) != 0:
nb = read(subscription, nb)
else:
-
+
try:
title = subscription.text
@@ -118,7 +118,9 @@ def import_opml(email, opml_file):
except:
site_link = ""
- new_feed = Feed(title=title, description=description, link=link, site_link=site_link, email_notification=False, enabled=True)
+ new_feed = Feed(title=title, description=description,
+ link=link, site_link=site_link,
+ email_notification=False, enabled=True)
user.feeds.append(new_feed)
nb += 1
@@ -128,13 +130,15 @@ def import_opml(email, opml_file):
db.session.commit()
return nb
+
def clean_url(url):
"""
Remove utm_* parameters
"""
parsed_url = urlparse(url)
qd = parse_qs(parsed_url.query, keep_blank_values=True)
- filtered = dict((k, v) for k, v in qd.iteritems() if not k.startswith('utm_'))
+ filtered = dict((k, v) for k, v in qd.iteritems()
+ if not k.startswith('utm_'))
nice_url = urlunparse([
parsed_url.scheme,
parsed_url.netloc,
@@ -145,6 +149,7 @@ def clean_url(url):
])
return nice_url
+
def open_url(url):
"""
Open an URL with the proxy and the user-agent
@@ -175,6 +180,7 @@ def open_url(url):
#pyaggr3g470r_log.error(url + " " + str(e.reason.errno) + " " + e.reason.strerror)
return (False, error)
+
def clear_string(data):
"""
Clear a string by removing HTML tags, HTML special caracters
@@ -184,6 +190,7 @@ def clear_string(data):
q = re.compile('\s') # consecutive white spaces
return p.sub('', q.sub(' ', data))
+
def load_stop_words():
"""
Load the stop words and return them in a list.
@@ -199,6 +206,7 @@ def load_stop_words():
stop_words += stop_wods_file.read().split(";")
return stop_words
+
def top_words(articles, n=10, size=5):
"""
Return the n most frequent words in a list.
@@ -213,6 +221,7 @@ def top_words(articles, n=10, size=5):
words[word] += 1
return words.most_common(n)
+
def tag_cloud(tags):
"""
Generates a tags cloud.
@@ -222,6 +231,7 @@ def tag_cloud(tags):
(min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \
for (word, count) in tags])
+
def search_feed(url):
"""
Search a feed in a HTML page.
diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py
index 514d1920..02ef4f9e 100644
--- a/pyaggr3g470r/views.py
+++ b/pyaggr3g470r/views.py
@@ -38,7 +38,6 @@ from werkzeug import generate_password_hash
import conf
import utils
import export
-import models
if not conf.ON_HEROKU:
import search as fastsearch
from forms import SigninForm, AddFeedForm, ProfileForm
@@ -103,7 +102,7 @@ def page_not_found(e):
return render_template('errors/404.html'), 404
@app.errorhandler(500)
-def page_not_found(e):
+def internal_server_error(e):
return render_template('errors/500.html'), 500
@@ -166,7 +165,7 @@ def home():
new_feed.id = feed.id
new_feed.title = feed.title
new_feed.enabled = feed.enabled
- new_feed.articles = Article.query.filter(Article.user_id == g.user.id,
+ new_feed.articles = Article.query.filter(Article.user_id == g.user.id,
Article.feed_id == feed.id).order_by(desc("Article.date")).limit(9)
result.append(new_feed)
unread_articles = len(Article.query.filter(Article.user_id == g.user.id, Article.readed == False).all())
@@ -247,7 +246,7 @@ def article(article_id=None):
return render_template('article.html', head_title=utils.clear_string(article.title), article=article)
flash("This article do not exist.", 'warning')
return redirect(redirect_url())
-
+
@app.route('/mark_as_read/', methods=['GET'])
@app.route('/mark_as_read/<int:feed_id>', methods=['GET'])
@@ -473,7 +472,7 @@ def management():
data = request.files.get('opmlfile', None)
if None == data or not allowed_file(data.filename):
flash('File not allowed.', 'danger')
- else:
+ else:
opml_path = os.path.join("./pyaggr3g470r/var/", data.filename)
data.save(opml_path)
try:
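
Note: the utils.py hunk above reflows clean_url(), which strips utm_* tracking parameters from article URLs before they are stored. A minimal standalone sketch of the same approach is below; it assumes Python 3's urllib.parse (the project itself imports Python 2's urlparse/urllib), and the example URL is purely illustrative.

    # Sketch only: same utm_*-stripping idea as utils.clean_url in the diff,
    # adapted to Python 3's urllib.parse for a self-contained, runnable example.
    from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

    def clean_url(url):
        """Drop utm_* query parameters from a URL, keeping everything else."""
        parsed = urlparse(url)
        query = parse_qs(parsed.query, keep_blank_values=True)
        filtered = {k: v for k, v in query.items() if not k.startswith('utm_')}
        return urlunparse([
            parsed.scheme,
            parsed.netloc,
            parsed.path,
            parsed.params,
            urlencode(filtered, doseq=True),
            parsed.fragment,
        ])

    # Usage (hypothetical URL):
    # clean_url("http://example.com/post?id=1&utm_source=feed")
    # -> "http://example.com/post?id=1"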