diff options
author | Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com> | 2015-04-22 11:06:27 +0200 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel+bitbucket@gmail.com> | 2015-04-22 11:06:27 +0200 |
commit | 4fa09afdb7465db6730cb69a9f99279afdb0cf87 (patch) | |
tree | ec221bc4fc68389fb58672cd01b34bf1740c43b0 /pyaggr3g470r | |
parent | Updated NEWS.rst (diff) | |
parent | impacting wosh suppression to controller (diff) | |
download | newspipe-4fa09afdb7465db6730cb69a9f99279afdb0cf87.tar.gz newspipe-4fa09afdb7465db6730cb69a9f99279afdb0cf87.tar.bz2 newspipe-4fa09afdb7465db6730cb69a9f99279afdb0cf87.zip |
Merged in jaesivsm/pyaggr3g470r (pull request #11)
misc improvement in http crawler
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- | pyaggr3g470r/controllers/abstract.py | 4 | ||||
-rw-r--r-- | pyaggr3g470r/controllers/article.py | 8 | ||||
-rw-r--r-- | pyaggr3g470r/controllers/feed.py | 10 | ||||
-rw-r--r-- | pyaggr3g470r/lib/crawler.py | 15 | ||||
-rw-r--r-- | pyaggr3g470r/models/feed.py | 1 | ||||
-rw-r--r-- | pyaggr3g470r/templates/layout.html | 9 | ||||
-rwxr-xr-x | pyaggr3g470r/utils.py | 21 | ||||
-rw-r--r-- | pyaggr3g470r/views/api/article.py | 3 | ||||
-rw-r--r-- | pyaggr3g470r/views/api/common.py | 7 | ||||
-rw-r--r-- | pyaggr3g470r/views/api/feed.py | 21 | ||||
-rw-r--r-- | pyaggr3g470r/views/article.py | 2 | ||||
-rw-r--r-- | pyaggr3g470r/views/feed.py | 2 | ||||
-rw-r--r-- | pyaggr3g470r/views/views.py | 17 |
13 files changed, 71 insertions, 49 deletions
diff --git a/pyaggr3g470r/controllers/abstract.py b/pyaggr3g470r/controllers/abstract.py index 9a9004af..95f9e211 100644 --- a/pyaggr3g470r/controllers/abstract.py +++ b/pyaggr3g470r/controllers/abstract.py @@ -70,7 +70,9 @@ class AbstractController(object): def create(self, **attrs): assert self._user_id_key in attrs or self.user_id is not None, \ "You must provide user_id one way or another" - attrs[self._user_id_key] = self.user_id or attrs.get(self._user_id_key) + + if self._user_id_key not in attrs: + attrs[self._user_id_key] = self.user_id obj = self._db_cls(**attrs) db.session.add(obj) db.session.commit() diff --git a/pyaggr3g470r/controllers/article.py b/pyaggr3g470r/controllers/article.py index 0ec53a2f..bcd73e99 100644 --- a/pyaggr3g470r/controllers/article.py +++ b/pyaggr3g470r/controllers/article.py @@ -1,7 +1,6 @@ from sqlalchemy import func from bootstrap import db -import conf from .abstract import AbstractController from pyaggr3g470r.models import Article @@ -15,13 +14,6 @@ class ArticleController(AbstractController): self.update({'id': article.id}, {'readed': True}) return article - def delete(self, obj_id): - obj = super(ArticleController, self).delete(obj_id) - if not conf.ON_HEROKU: - import pyaggr3g470r.search as fastsearch - fastsearch.delete_article(self.user_id, obj.feed_id, obj_id) - return obj - def challenge(self, ids): """Will return each id that wasn't found in the database.""" for id_ in ids: diff --git a/pyaggr3g470r/controllers/feed.py b/pyaggr3g470r/controllers/feed.py index 8db279ae..82714e39 100644 --- a/pyaggr3g470r/controllers/feed.py +++ b/pyaggr3g470r/controllers/feed.py @@ -27,8 +27,9 @@ from .abstract import AbstractController from pyaggr3g470r.models import Feed logger = logging.getLogger(__name__) -DEFAULT_MAX_ERROR = conf.DEFAULT_MAX_ERROR DEFAULT_LIMIT = 5 +DEFAULT_REFRESH_RATE = 60 +DEFAULT_MAX_ERROR = conf.DEFAULT_MAX_ERROR class FeedController(AbstractController): @@ -42,11 +43,10 @@ class FeedController(AbstractController): .order_by('Feed.last_retrieved') .limit(limit)] - def list_fetchable(self, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT): - from pyaggr3g470r.controllers import UserController + def list_fetchable(self, max_error=DEFAULT_MAX_ERROR, limit=DEFAULT_LIMIT, + refresh_rate=DEFAULT_REFRESH_RATE): now = datetime.now() - user = UserController(self.user_id).get(id=self.user_id) - max_last = now - timedelta(minutes=user.refresh_rate or 60) + max_last = now - timedelta(minutes=refresh_rate) feeds = self.list_late(max_last, max_error, limit) if feeds: self.update({'id__in': [feed.id for feed in feeds]}, diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py index 1cb61973..339c4b12 100644 --- a/pyaggr3g470r/lib/crawler.py +++ b/pyaggr3g470r/lib/crawler.py @@ -16,7 +16,6 @@ import time import conf import json import logging -import requests import feedparser import dateutil.parser from hashlib import md5 @@ -97,6 +96,7 @@ class AbstractCrawler: @classmethod def get_counter_callback(cls): cls.__counter__ += 1 + def debump(*args, **kwargs): cls.__counter__ -= 1 return debump @@ -157,6 +157,7 @@ class PyAggUpdater(AbstractCrawler): content = entry['summary'] return {'feed_id': self.feed['id'], + 'user_id': self.feed['user_id'], 'entry_id': extract_id(entry).get('entry_id', None), 'link': entry.get('link', self.feed['site_link']), 'title': entry.get('title', 'No title'), @@ -176,11 +177,11 @@ class PyAggUpdater(AbstractCrawler): for id_to_create in results: entry = self.to_article( self.entries[tuple(sorted(id_to_create.items()))]) - logger.warn('%r %r - creating %r - %r', self.feed['id'], - self.feed['title'], entry['title'], id_to_create) + logger.warn('%r %r - creating %r for %r - %r', self.feed['id'], + self.feed['title'], entry['title'], entry['user_id'], + id_to_create) self.query_pyagg('post', 'article', entry) - now = datetime.now() logger.debug('%r %r - updating feed etag %r last_mod %r', self.feed['id'], self.feed['title'], self.headers.get('etag', ''), @@ -264,8 +265,10 @@ class FeedCrawler(AbstractCrawler): ids, entries = [], {} parsed_response = feedparser.parse(response.text) for entry in parsed_response['entries']: - entries[tuple(sorted(extract_id(entry).items()))] = entry - ids.append(extract_id(entry)) + entry_ids = extract_id(entry) + entry_ids['feed_id'] = self.feed['id'] + entries[tuple(sorted(entry_ids.items()))] = entry + ids.append(entry_ids) logger.debug('%r %r - found %d entries %r', self.feed['id'], self.feed['title'], len(ids), ids) future = self.query_pyagg('get', 'articles/challenge', {'ids': ids}) diff --git a/pyaggr3g470r/models/feed.py b/pyaggr3g470r/models/feed.py index aff11460..e43045f1 100644 --- a/pyaggr3g470r/models/feed.py +++ b/pyaggr3g470r/models/feed.py @@ -63,6 +63,7 @@ class Feed(db.Model): def dump(self): return {"id": self.id, + "user_id": self.user_id, "title": self.title, "description": self.description, "link": self.link, diff --git a/pyaggr3g470r/templates/layout.html b/pyaggr3g470r/templates/layout.html index fba29ae3..1be20f35 100644 --- a/pyaggr3g470r/templates/layout.html +++ b/pyaggr3g470r/templates/layout.html @@ -6,7 +6,7 @@ <meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="description" content="pyAggr3g470r is a web-based news aggregator." /> <meta name="author" content="" /> - <title>pyAggr3g470r{% if head_title %} - {{ head_title }}{% endif %}</title> + <title>pyAggr3g470r{% if head_titles %} - {{ ' - '.join(head_titles) }}{% endif %}</title> <link rel="shortcut icon" href="{{ url_for('static', filename='img/favicon.png') }}" /> <!-- Bootstrap core CSS --> <link href="{{ url_for('static', filename='css/bootstrap.css') }}" rel="stylesheet" media="screen" /> @@ -26,7 +26,12 @@ <span class="icon-bar"></span> </button> <a class="navbar-brand" href="{{ url_for("home") }}">pyAggr3g470r</a> - <span class="navbar-brand">{% if head_title %} - {{ head_title }}{% endif %}</span> + {% if head_titles %} + {% for head_title in head_titles %} + <span class="navbar-brand"> - </span> + <span class="navbar-brand">{{ head_title }}</span> + {% endfor %} + {% endif %} </div> <!-- Collect the nav links, forms, and other content for toggling --> diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py index ea8a87bf..628703d2 100755 --- a/pyaggr3g470r/utils.py +++ b/pyaggr3g470r/utils.py @@ -172,7 +172,7 @@ def import_json(email, json_content): description="", link=feed["link"], site_link=feed["site_link"], - created_date=datetime.datetime.\ + created_date=datetime.datetime. fromtimestamp(int(feed["created_date"])), enabled=feed["enabled"]) user.feeds.append(new_feed) @@ -191,10 +191,10 @@ def import_json(email, json_content): title=article["title"], content=article["content"], readed=article["readed"], - like=article["like"], \ - retrieved_date=datetime.datetime.\ + like=article["like"], + retrieved_date=datetime.datetime. fromtimestamp(int(article["retrieved_date"])), - date=datetime.datetime.\ + date=datetime.datetime. fromtimestamp(int(article["date"])), user_id=user.id, feed_id=user_feed.id) @@ -228,7 +228,7 @@ def open_url(url): if conf.HTTP_PROXY == "": proxy = {} else: - proxy = {"http" : conf.HTTP_PROXY} + proxy = {"http": conf.HTTP_PROXY} opener = urllib.request.FancyURLopener(proxy) try: opener = urllib.request.build_opener() @@ -236,27 +236,28 @@ def open_url(url): return (True, opener.open(url)) except urllib.error.HTTPError as e: # server couldn't fulfill the request - error = (url, e.code, \ - http.server.BaseHTTPRequestHandler.responses[e.code][1]) + error = (url, e.code, + http.server.BaseHTTPRequestHandler.responses[e.code][1]) return (False, error) except urllib.error.URLError as e: # failed to reach the server if type(e.reason) == str: error = (url, e.reason, e.reason) - #pyaggr3g470r_log.error(url + " " + e.reason) else: error = (url, e.reason.errno, e.reason.strerror) return (False, error) + def clear_string(data): """ Clear a string by removing HTML tags, HTML special caracters and consecutive white spaces (more that one). """ - p = re.compile('<[^>]+>') # HTML tags - q = re.compile('\s') # consecutive white spaces + p = re.compile('<[^>]+>') # HTML tags + q = re.compile('\s') # consecutive white spaces return p.sub('', q.sub(' ', data)) + def load_stop_words(): """ Load the stop words and return them in a list. diff --git a/pyaggr3g470r/views/api/article.py b/pyaggr3g470r/views/api/article.py index c3ec2d34..516eef8f 100644 --- a/pyaggr3g470r/views/api/article.py +++ b/pyaggr3g470r/views/api/article.py @@ -11,7 +11,8 @@ from pyaggr3g470r.views.api.common import PyAggAbstractResource,\ PyAggResourceMulti -ARTICLE_ATTRS = {'feed_id': {'type': str}, +ARTICLE_ATTRS = {'user_id': {'type': int}, + 'feed_id': {'type': int}, 'entry_id': {'type': str}, 'link': {'type': str}, 'title': {'type': str}, diff --git a/pyaggr3g470r/views/api/common.py b/pyaggr3g470r/views/api/common.py index b8477d4b..ca344c04 100644 --- a/pyaggr3g470r/views/api/common.py +++ b/pyaggr3g470r/views/api/common.py @@ -51,7 +51,8 @@ def authenticate(func): # authentication via HTTP only auth = request.authorization if auth is not None: - user = User.query.filter(User.nickname == auth.username).first() + user = User.query.filter( + User.nickname == auth.username).first() if user and user.check_password(auth.password) \ and user.activation_key == "": g.user = user @@ -61,6 +62,7 @@ def authenticate(func): raise Unauthorized({'WWWAuthenticate': 'Basic realm="Login Required"'}) return wrapper + def to_response(func): """Will cast results of func as a result, and try to extract a status_code for the Response object""" @@ -158,7 +160,8 @@ class PyAggResourceMulti(PyAggAbstractResource): return [res for res in self.controller.read().limit(limit)] if not limit: return [res for res in self.controller.read(**request.json).all()] - return [res for res in self.controller.read(**request.json).limit(limit)] + return [res + for res in self.controller.read(**request.json).limit(limit)] def post(self): """creating several objects. payload should be a list of dict. diff --git a/pyaggr3g470r/views/api/feed.py b/pyaggr3g470r/views/api/feed.py index 7d0e2862..ad185de9 100644 --- a/pyaggr3g470r/views/api/feed.py +++ b/pyaggr3g470r/views/api/feed.py @@ -3,8 +3,10 @@ from flask import g -from pyaggr3g470r.controllers.feed import FeedController, \ - DEFAULT_MAX_ERROR, DEFAULT_LIMIT +from pyaggr3g470r.controllers.feed import (FeedController, + DEFAULT_MAX_ERROR, + DEFAULT_LIMIT, + DEFAULT_REFRESH_RATE) from pyaggr3g470r.views.api.common import PyAggAbstractResource, \ PyAggResourceNew, \ @@ -41,11 +43,20 @@ class FetchableFeedAPI(PyAggAbstractResource): controller_cls = FeedController to_date = ['date', 'last_retrieved'] attrs = {'max_error': {'type': int, 'default': DEFAULT_MAX_ERROR}, - 'limit': {'type': int, 'default': DEFAULT_LIMIT}} + 'limit': {'type': int, 'default': DEFAULT_LIMIT}, + 'refresh_rate': {'type': int, 'default': DEFAULT_REFRESH_RATE}, + 'retreive_all': {'type': bool, 'default': False}} def get(self): - return [feed for feed in self.controller.list_fetchable( - **self.reqparse_args())] + args = self.reqparse_args() + if g.user.refresh_rate: + args['refresh_rate'] = g.user.refresh_rate + + dont_filter_by_user = args.pop('retreive_all') and g.user.is_admin() + + contr = self.controller_cls() if dont_filter_by_user \ + else self.controller + return [feed for feed in contr.list_fetchable(**args)] g.api.add_resource(FeedNewAPI, '/feed', endpoint='feed_new.json') g.api.add_resource(FeedAPI, '/feed/<int:obj_id>', endpoint='feed.json') diff --git a/pyaggr3g470r/views/article.py b/pyaggr3g470r/views/article.py index 08c92686..75360485 100644 --- a/pyaggr3g470r/views/article.py +++ b/pyaggr3g470r/views/article.py @@ -49,7 +49,7 @@ def article(article_id=None): next_article = article.source.articles[-1] return render_template('article.html', - head_title=utils.clear_string(article.title), + head_titles=[utils.clear_string(article.title)], article=article, previous_article=previous_article, next_article=next_article) diff --git a/pyaggr3g470r/views/feed.py b/pyaggr3g470r/views/feed.py index 159dce64..bc75aa29 100644 --- a/pyaggr3g470r/views/feed.py +++ b/pyaggr3g470r/views/feed.py @@ -53,7 +53,7 @@ def feed(feed_id=None): elapsed = today - last_article return render_template('feed.html', - head_title=utils.clear_string(feed.title), + head_titles=[utils.clear_string(feed.title)], feed=feed, tag_cloud=tag_cloud, first_post_date=first_article, end_post_date=last_article, diff --git a/pyaggr3g470r/views/views.py b/pyaggr3g470r/views/views.py index 9f4ef0b7..f344ad61 100644 --- a/pyaggr3g470r/views/views.py +++ b/pyaggr3g470r/views/views.py @@ -228,15 +228,19 @@ def signup(): flash(gettext('Problem while sending activation email') + ': ' + str(e), 'danger') return redirect(url_for('home')) - flash(gettext('Your account has been created. Check your mail to confirm it.'), 'success') + flash(gettext('Your account has been created. ' + 'Check your mail to confirm it.'), 'success') return redirect(url_for('home')) return render_template('signup.html', form=form) -def render_home(filters=None, head_title='', page_to_render='home', **kwargs): +def render_home(filters=None, head_titles=None, + page_to_render='home', **kwargs): if filters is None: filters = {} + if head_titles is None: + head_titles = [] feed_contr = FeedController(g.user.id) arti_contr = ArticleController(g.user.id) feeds = {feed.id: feed.title for feed in feed_contr.read()} @@ -255,8 +259,7 @@ def render_home(filters=None, head_title='', page_to_render='home', **kwargs): filters['readed'] = filter_ == 'read' if feed_id: filters['feed_id'] = feed_id - head_title = "%s%s" % (feed_contr.get(id=feed_id).title, - (' - %s' % head_title) if head_title else '') + head_titles.append(feed_contr.get(id=feed_id).title) sort_param = {"feed": Article.title.desc(), "date": Article.date.desc(), @@ -286,7 +289,7 @@ def render_home(filters=None, head_title='', page_to_render='home', **kwargs): return render_template('home.html', gen_url=gen_url, feed_id=feed_id, filter_=filter_, limit=limit, feeds=feeds, unread=unread, articles=articles, in_error=in_error, - head_title=head_title, sort_=sort_, **kwargs) + head_titles=head_titles, sort_=sort_, **kwargs) @app.route('/') @@ -299,7 +302,7 @@ def home(): @app.route('/favorites') @login_required def favorites(): - return render_home({'like': True}, gettext('Favorites'), 'favorites') + return render_home({'like': True}, [gettext('Favorites')], 'favorites') @app.route('/search', methods=['GET']) @@ -319,7 +322,7 @@ def search(): filters['content__like'] = "%%%s%%" % query if len(filters) > 1: filters = {"__or__": filters} - return render_home(filters, "%s %s" % (gettext('Search:'), query), + return render_home(filters, ["%s %s" % (gettext('Search:'), query)], 'search', search_query=query, search_title=search_title, search_content=search_content) |