From b0e987fbafaa28226c54157fb11993079c5341e2 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Thu, 17 Nov 2016 08:30:06 +0100
Subject: cleaning the mess in the libs directories

---
 README.rst                        |   4 +-
 src/bootstrap.py                  |   4 +
 src/conf/conf.cfg-sample          |   6 +-
 src/crawler/classic_crawler.py    |   4 +-
 src/crawler/http_crawler.py       | 251 ---------------------------
 src/lib/__init__.py               |   0
 src/lib/article_utils.py          | 186 +++++++++++++++++++
 src/lib/data.py                   | 162 ++++++++++++++++
 src/lib/feed_utils.py             | 123 ++++++++++++
 src/lib/misc_utils.py             | 185 ++++++++++++++++++++
 src/lib/utils.py                  |  89 ++++++++++++
 src/tests/__init__.py             |   0
 src/tests/base.py                 |  41 ------
 src/tests/controllers/__init__.py |   5 -
 src/tests/controllers/article.py  | 117 ----------------
 src/tests/controllers/feed.py     |  27 ----
 src/tests/fixtures.py             |  31 -----
 src/web/controllers/article.py    |   2 +-
 src/web/controllers/feed.py       |   2 +-
 src/web/export.py                 |  58 --------
 src/web/forms.py                  |   2 +-
 src/web/lib/article_utils.py      | 186 -------------------------
 src/web/lib/feed_utils.py         | 123 ----------------
 src/web/lib/misc_utils.py         | 286 --------------------------------------
 src/web/lib/utils.py              |  89 ------------
 src/web/lib/view_utils.py         |   2 +-
 src/web/models/category.py        |   5 +-
 src/web/models/icon.py            |   3 +
 src/web/views/admin.py            |   2 +-
 src/web/views/article.py          |   4 +-
 src/web/views/category.py         |   2 +-
 src/web/views/common.py           |   2 +-
 src/web/views/feed.py             |   4 +-
 src/web/views/home.py             |   4 +-
 src/web/views/user.py             |   7 +-
 35 files changed, 779 insertions(+), 1239 deletions(-)
 delete mode 100644 src/crawler/http_crawler.py
 create mode 100644 src/lib/__init__.py
 create mode 100644 src/lib/article_utils.py
 create mode 100644 src/lib/data.py
 create mode 100644 src/lib/feed_utils.py
 create mode 100755 src/lib/misc_utils.py
 create mode 100644 src/lib/utils.py
 delete mode 100644 src/tests/__init__.py
 delete mode 100644 src/tests/base.py
 delete mode 100644 src/tests/controllers/__init__.py
 delete mode 100644 src/tests/controllers/article.py
 delete mode 100644 src/tests/controllers/feed.py
 delete mode 100644 src/tests/fixtures.py
 delete mode 100644 src/web/export.py
 delete mode 100644 src/web/lib/article_utils.py
 delete mode 100644 src/web/lib/feed_utils.py
 delete mode 100755 src/web/lib/misc_utils.py
 delete mode 100644 src/web/lib/utils.py

diff --git a/README.rst b/README.rst
index b674e391..a345320b 100644
--- a/README.rst
+++ b/README.rst
@@ -5,7 +5,7 @@ Newspipe
 Presentation
 ------------
 
-`Newspipe <https://github.com/Newspipe/Newspipe>`_ is a web-based news
+`Newspipe <https://github.com/newspipe/newspipe>`_ is a web-based news
 aggregator and reader.
 
 Main features
@@ -36,7 +36,7 @@ provides different ways to install Newspipe.
 
 License
 -------
-`Newspipe <https://github.com/Newspipe/Newspipe>`_ is under the
+`Newspipe <https://github.com/newspipe/newspipe>`_ is under the
 `GNU Affero General Public License version 3 `_.
 Contact
diff --git a/src/bootstrap.py b/src/bootstrap.py
index f9de381a..5af29c69 100644
--- a/src/bootstrap.py
+++ b/src/bootstrap.py
@@ -18,6 +18,10 @@ def set_logging(log_path=None, log_level=logging.INFO, modules=(),
     if conf.ON_HEROKU:
         log_format = '%(levelname)s %(message)s'
     if log_path:
+        if not os.path.exists(os.path.dirname(log_path)):
+            os.makedirs(os.path.dirname(log_path))
+        if not os.path.exists(log_path):
+            open(log_path, 'w').close()
         handler = logging.FileHandler(log_path)
     else:
         handler = logging.StreamHandler()
diff --git a/src/conf/conf.cfg-sample b/src/conf/conf.cfg-sample
index 6fae48b5..7c4668af 100644
--- a/src/conf/conf.cfg-sample
+++ b/src/conf/conf.cfg-sample
@@ -9,7 +9,7 @@ platform_url = http://127.0.0.1:5000/
 admin_email =
 security_password_salt = a secret to confirm user account
 token_validity_period = 3600
-log_path = ./src/web/var/newspipe.log
+log_path = ./var/newspipe.log
 nb_worker = 5
 log_level = info
 [database]
@@ -17,9 +17,7 @@ database_url = postgres://pgsqluser:pgsqlpwd@127.0.0.1:5432/aggregator
 [crawler]
 crawling_method = classic
 default_max_error = 6
-user_agent = Newspipe (https://github.com/Newspipe/Newspipe)
-api_login =
-api_passwd =
+user_agent = Newspipe (https://github.com/newspipe/newspipe)
 timeout = 30
 resolv = true
 feed_refresh_interval = 120
diff --git a/src/crawler/classic_crawler.py b/src/crawler/classic_crawler.py
index eb75b78f..34726a83 100644
--- a/src/crawler/classic_crawler.py
+++ b/src/crawler/classic_crawler.py
@@ -37,8 +37,8 @@ import conf
 from bootstrap import db
 from web.models import User
 from web.controllers import FeedController, ArticleController
-from web.lib.feed_utils import construct_feed_from, is_parsing_ok
-from web.lib.article_utils import construct_article, extract_id, \
+from lib.feed_utils import construct_feed_from, is_parsing_ok
+from lib.article_utils import construct_article, extract_id, \
     get_article_content
 
 logger = logging.getLogger(__name__)
diff --git a/src/crawler/http_crawler.py b/src/crawler/http_crawler.py
deleted file mode 100644
index f480fe96..00000000
--- a/src/crawler/http_crawler.py
+++ /dev/null
@@ -1,251 +0,0 @@
-"""
-Here's a sum up on how it works :
-
-CrawlerScheduler.run
-    will retreive a list of feeds to be refreshed and pass result to
-CrawlerScheduler.callback
-    which will retreive each feed and treat result with
-FeedCrawler.callback
-    which will interprete the result (status_code, etag) collect ids
-    and match them agaisnt pyagg which will cause
-PyAggUpdater.callback
-    to create the missing entries
-"""
-
-import time
-import conf
-import json
-import logging
-import feedparser
-from datetime import datetime, timedelta
-from time import strftime, gmtime
-from concurrent.futures import ThreadPoolExecutor
-from requests_futures.sessions import FuturesSession
-from web.lib.utils import default_handler, to_hash
-from web.lib.feed_utils import construct_feed_from
-from web.lib.article_utils import extract_id, construct_article
-
-logger = logging.getLogger(__name__)
-logging.captureWarnings(True)
-API_ROOT = "api/v2.0/"
-
-
-class AbstractCrawler:
-
-    def __init__(self, auth, pool=None, session=None):
-        self.auth = auth
-        self.pool = pool or ThreadPoolExecutor(max_workers=conf.NB_WORKER)
-        self.session = session or FuturesSession(executor=self.pool)
-        self.session.verify = False
-        self.url = conf.PLATFORM_URL
-
-    def query_pyagg(self, method, urn, data=None):
-        """A wrapper for internal call, method should be ones you can find
-        on requests (header, post, get, options, ...), urn the
distant - resources you want to access on pyagg, and data, the data you wanna - transmit.""" - if data is None: - data = {} - method = getattr(self.session, method) - return method("%s%s%s" % (self.url, API_ROOT, urn), - auth=self.auth, data=json.dumps(data, - default=default_handler), - headers={'Content-Type': 'application/json', - 'User-Agent': conf.USER_AGENT}) - - def wait(self, max_wait=300, checks=5, wait_for=2): - checked, second_waited = 0, 0 - while True: - time.sleep(wait_for) - second_waited += wait_for - if second_waited > max_wait: - logger.warn('Exiting after %d seconds', second_waited) - break - if self.pool._work_queue.qsize(): - checked = 0 - continue - checked += 1 - if checked == checks: - break - - -class PyAggUpdater(AbstractCrawler): - - def __init__(self, feed, entries, headers, parsed_feed, - auth, pool=None, session=None): - self.feed = feed - self.entries = entries - self.headers = headers - self.parsed_feed = parsed_feed - super().__init__(auth, pool, session) - - def callback(self, response): - """Will process the result from the challenge, creating missing article - and updating the feed""" - article_created = False - if response.result().status_code != 204: - results = response.result().json() - logger.debug('%r %r - %d entries were not matched ' - 'and will be created', - self.feed['id'], self.feed['title'], len(results)) - for id_to_create in results: - article_created = True - entry = construct_article( - self.entries[tuple(sorted(id_to_create.items()))], - self.feed) - logger.info('%r %r - creating %r for %r - %r', self.feed['id'], - self.feed['title'], entry['title'], - entry['user_id'], id_to_create) - self.query_pyagg('post', 'article', entry) - - logger.debug('%r %r - updating feed etag %r last_mod %r', - self.feed['id'], self.feed['title'], - self.headers.get('etag', ''), - self.headers.get('last-modified', '')) - - up_feed = {'error_count': 0, 'last_error': None, - 'etag': self.headers.get('etag', ''), - 'last_modified': self.headers.get('last-modified', - strftime('%a, %d %b %Y %X %Z', gmtime()))} - fresh_feed = construct_feed_from(url=self.feed['link'], - fp_parsed=self.parsed_feed) - for key in ('description', 'site_link', 'icon_url'): - if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key): - up_feed[key] = fresh_feed[key] - if not self.feed.get('title'): - up_feed['title'] = fresh_feed.get('title', '') - up_feed['user_id'] = self.feed['user_id'] - # re-getting that feed earlier since new entries appeared - if article_created: - up_feed['last_retrieved'] \ - = (datetime.now() - timedelta(minutes=45)).isoformat() - - diff_keys = {key for key in up_feed - if up_feed[key] != self.feed.get(key)} - if not diff_keys: - return # no change in the feed, no update - if not article_created and diff_keys == {'last_modified', 'etag'}: - return # meaningless if no new article has been published - logger.info('%r %r - pushing feed attrs %r', - self.feed['id'], self.feed['title'], - {key: "%s -> %s" % (up_feed[key], self.feed.get(key)) - for key in up_feed if up_feed[key] != self.feed.get(key)}) - - self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed) - - -class FeedCrawler(AbstractCrawler): - - def __init__(self, feed, auth, pool=None, session=None): - self.feed = feed - super().__init__(auth, pool, session) - - def clean_feed(self): - """Will reset the errors counters on a feed that have known errors""" - if self.feed.get('error_count') or self.feed.get('last_error'): - self.query_pyagg('put', 'feed/%d' % self.feed['id'], - {'error_count': 
0, 'last_error': ''}) - - def callback(self, response): - """will fetch the feed and interprete results (304, etag) or will - challenge pyagg to compare gotten entries with existing ones""" - try: - response = response.result() - response.raise_for_status() - except Exception as error: - error_count = self.feed['error_count'] + 1 - logger.exception('%r %r - an error occured while fetching ' - 'feed; bumping error count to %r', - self.feed['id'], self.feed['title'], error_count) - future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], - {'error_count': error_count, - 'last_error': str(error), - 'user_id': self.feed['user_id']}) - return - - if response.status_code == 304: - logger.info("%r %r - feed responded with 304", - self.feed['id'], self.feed['title']) - self.clean_feed() - return - if 'etag' not in response.headers: - logger.debug('%r %r - manually generating etag', - self.feed['id'], self.feed['title']) - response.headers['etag'] = 'pyagg/"%s"' % to_hash(response.text) - if response.headers['etag'] and self.feed['etag'] \ - and response.headers['etag'] == self.feed['etag']: - if 'pyagg' in self.feed['etag']: - logger.info("%r %r - calculated hash matches (%d)", - self.feed['id'], self.feed['title'], - response.status_code) - else: - logger.info("%r %r - feed responded with same etag (%d)", - self.feed['id'], self.feed['title'], - response.status_code) - self.clean_feed() - return - else: - logger.debug('%r %r - etag mismatch %r != %r', - self.feed['id'], self.feed['title'], - response.headers['etag'], self.feed['etag']) - logger.info('%r %r - cache validation failed, challenging entries', - self.feed['id'], self.feed['title']) - - ids, entries = [], {} - parsed_response = feedparser.parse(response.content) - for entry in parsed_response['entries']: - entry_ids = extract_id(entry) - entry_ids['feed_id'] = self.feed['id'] - entry_ids['user_id'] = self.feed['user_id'] - entries[tuple(sorted(entry_ids.items()))] = entry - ids.append(entry_ids) - logger.debug('%r %r - found %d entries %r', - self.feed['id'], self.feed['title'], len(ids), ids) - future = self.query_pyagg('get', 'articles/challenge', {'ids': ids}) - updater = PyAggUpdater(self.feed, entries, response.headers, - parsed_response, - self.auth, self.pool, self.session) - future.add_done_callback(updater.callback) - - -class CrawlerScheduler(AbstractCrawler): - - def __init__(self, username, password, pool=None, session=None): - self.auth = (username, password) - super(CrawlerScheduler, self).__init__(self.auth, pool, session) - - def prepare_headers(self, feed): - """For a known feed, will construct some header dictionnary""" - headers = {'User-Agent': conf.USER_AGENT} - if feed.get('last_modified'): - headers['If-Modified-Since'] = feed['last_modified'] - if feed.get('etag') and 'pyagg' not in feed['etag']: - headers['If-None-Match'] = feed['etag'] - logger.debug('%r %r - calculated headers %r', - feed['id'], feed['title'], headers) - return headers - - def callback(self, response): - """processes feeds that need to be fetched""" - response = response.result() - response.raise_for_status() - if response.status_code == 204: - logger.debug("No feed to fetch") - return - feeds = response.json() - logger.debug('%d to fetch %r', len(feeds), feeds) - for feed in feeds: - logger.debug('%r %r - fetching resources', - feed['id'], feed['title']) - future = self.session.get(feed['link'], - headers=self.prepare_headers(feed)) - - feed_crwlr = FeedCrawler(feed, self.auth, self.pool, self.session) - 
future.add_done_callback(feed_crwlr.callback) - - def run(self, **kwargs): - """entry point, will retreive feeds to be fetch - and launch the whole thing""" - logger.debug('retreving fetchable feed') - future = self.query_pyagg('get', 'feeds/fetchable', kwargs) - future.add_done_callback(self.callback) diff --git a/src/lib/__init__.py b/src/lib/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/lib/article_utils.py b/src/lib/article_utils.py new file mode 100644 index 00000000..49494e85 --- /dev/null +++ b/src/lib/article_utils.py @@ -0,0 +1,186 @@ +import html +import logging +import re +from datetime import datetime, timezone +from enum import Enum +from urllib.parse import SplitResult, urlsplit, urlunsplit + +import dateutil.parser +from bs4 import BeautifulSoup, SoupStrainer +from requests.exceptions import MissingSchema + +import conf +from lib.utils import jarr_get + +logger = logging.getLogger(__name__) +PROCESSED_DATE_KEYS = {'published', 'created', 'updated'} + + +def extract_id(entry): + """ extract a value from an entry that will identify it among the other of + that feed""" + return entry.get('entry_id') or entry.get('id') or entry['link'] + + +async def construct_article(entry, feed, fields=None, fetch=True): + "Safe method to transorm a feedparser entry into an article" + now = datetime.utcnow() + article = {} + def push_in_article(key, value): + if not fields or key in fields: + article[key] = value + push_in_article('feed_id', feed.id) + push_in_article('user_id', feed.user_id) + push_in_article('entry_id', extract_id(entry)) + push_in_article('retrieved_date', now) + if not fields or 'date' in fields: + for date_key in PROCESSED_DATE_KEYS: + if entry.get(date_key): + try: + article['date'] = dateutil.parser.parse(entry[date_key])\ + .astimezone(timezone.utc) + except Exception as e: + logger.exception(e) + else: + break + push_in_article('content', get_article_content(entry)) + if fields is None or {'link', 'title'}.intersection(fields): + link, title = await get_article_details(entry, fetch) + push_in_article('link', link) + push_in_article('title', title) + if 'content' in article: + #push_in_article('content', clean_urls(article['content'], link)) + push_in_article('content', article['content']) + push_in_article('tags', {tag.get('term').strip() + for tag in entry.get('tags', []) \ + if tag and tag.get('term', False)}) + return article + + +def get_article_content(entry): + content = '' + if entry.get('content'): + content = entry['content'][0]['value'] + elif entry.get('summary'): + content = entry['summary'] + return content + + +async def get_article_details(entry, fetch=True): + article_link = entry.get('link') + article_title = html.unescape(entry.get('title', '')) + if fetch and conf.CRAWLER_RESOLV and article_link or not article_title: + try: + # resolves URL behind proxies (like feedproxy.google.com) + response = await jarr_get(article_link, timeout=5) + except MissingSchema: + split, failed = urlsplit(article_link), False + for scheme in 'https', 'http': + new_link = urlunsplit(SplitResult(scheme, *split[1:])) + try: + response = await jarr_get(new_link, timeout=5) + except Exception as error: + failed = True + continue + failed = False + article_link = new_link + break + if failed: + return article_link, article_title or 'No title' + except Exception as error: + logger.info("Unable to get the real URL of %s. Won't fix " + "link or title. 
Error: %s", article_link, error) + return article_link, article_title or 'No title' + article_link = response.url + if not article_title: + bs_parsed = BeautifulSoup(response.content, 'html.parser', + parse_only=SoupStrainer('head')) + try: + article_title = bs_parsed.find_all('title')[0].text + except IndexError: # no title + pass + return article_link, article_title or 'No title' + + +class FiltersAction(Enum): + READ = 'mark as read' + LIKED = 'mark as favorite' + SKIP = 'skipped' + + +class FiltersType(Enum): + REGEX = 'regex' + MATCH = 'simple match' + EXACT_MATCH = 'exact match' + TAG_MATCH = 'tag match' + TAG_CONTAINS = 'tag contains' + + +class FiltersTrigger(Enum): + MATCH = 'match' + NO_MATCH = 'no match' + + +def process_filters(filters, article, only_actions=None): + skipped, read, liked = False, None, False + filters = filters or [] + if only_actions is None: + only_actions = set(FiltersAction) + for filter_ in filters: + match = False + try: + pattern = filter_.get('pattern', '') + filter_type = FiltersType(filter_.get('type')) + filter_action = FiltersAction(filter_.get('action')) + filter_trigger = FiltersTrigger(filter_.get('action on')) + if filter_type is not FiltersType.REGEX: + pattern = pattern.lower() + except ValueError: + continue + if filter_action not in only_actions: + logger.debug('ignoring filter %r' % filter_) + continue + if filter_action in {FiltersType.REGEX, FiltersType.MATCH, + FiltersType.EXACT_MATCH} and 'title' not in article: + continue + if filter_action in {FiltersType.TAG_MATCH, FiltersType.TAG_CONTAINS} \ + and 'tags' not in article: + continue + title = article.get('title', '').lower() + tags = [tag.lower() for tag in article.get('tags', [])] + if filter_type is FiltersType.REGEX: + match = re.match(pattern, title) + elif filter_type is FiltersType.MATCH: + match = pattern in title + elif filter_type is FiltersType.EXACT_MATCH: + match = pattern == title + elif filter_type is FiltersType.TAG_MATCH: + match = pattern in tags + elif filter_type is FiltersType.TAG_CONTAINS: + match = any(pattern in tag for tag in tags) + take_action = match and filter_trigger is FiltersTrigger.MATCH \ + or not match and filter_trigger is FiltersTrigger.NO_MATCH + + if not take_action: + continue + + if filter_action is FiltersAction.READ: + read = True + elif filter_action is FiltersAction.LIKED: + liked = True + elif filter_action is FiltersAction.SKIP: + skipped = True + + if skipped or read or liked: + logger.info("%r applied on %r", filter_action.value, + article.get('link') or article.get('title')) + return skipped, read, liked + + +def get_skip_and_ids(entry, feed): + entry_ids = construct_article(entry, feed, + {'entry_id', 'feed_id', 'user_id'}, fetch=False) + skipped, _, _ = process_filters(feed.filters, + construct_article(entry, feed, {'title', 'tags'}, fetch=False), + {FiltersAction.SKIP}) + return skipped, entry_ids diff --git a/src/lib/data.py b/src/lib/data.py new file mode 100644 index 00000000..d887c003 --- /dev/null +++ b/src/lib/data.py @@ -0,0 +1,162 @@ +#! /usr/bin/env python +#-*- coding: utf-8 -*- + +# Newspipe - A Web based news aggregator. +# Copyright (C) 2010-2016 Cédric Bonhomme - https://www.cedricbonhomme.org +# +# For more information : https://github.com/newspipe/newspipe +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 0.1 $" +__date__ = "$Date: 2016/11/17 $" +__revision__ = "$Date: 2016/11/17 $" +__copyright__ = "Copyright (c) Cedric Bonhomme" +__license__ = "AGPLv3" + +# +# This file contains the import/export functions of Newspipe. +# + +import json +import opml +import datetime +from flask import jsonify + +from bootstrap import db +from web.models import User, Feed, Article + + +def import_opml(email, opml_content): + """ + Import new feeds from an OPML file. + """ + user = User.query.filter(User.email == email).first() + try: + subscriptions = opml.from_string(opml_content) + except: + logger.exception("Parsing OPML file failed:") + raise + + def read(subsubscription, nb=0): + """ + Parse recursively through the categories and sub-categories. + """ + for subscription in subsubscription: + if len(subscription) != 0: + nb = read(subscription, nb) + else: + try: + title = subscription.text + except: + title = "" + try: + description = subscription.description + except: + description = "" + try: + link = subscription.xmlUrl + except: + continue + if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == link).first(): + continue + try: + site_link = subscription.htmlUrl + except: + site_link = "" + new_feed = Feed(title=title, description=description, + link=link, site_link=site_link, + enabled=True) + user.feeds.append(new_feed) + nb += 1 + return nb + nb = read(subscriptions) + db.session.commit() + return nb + + +def import_json(email, json_content): + """ + Import an account from a JSON file. + """ + user = User.query.filter(User.email == email).first() + json_account = json.loads(json_content.decode("utf-8")) + nb_feeds, nb_articles = 0, 0 + # Create feeds: + for feed in json_account["result"]: + if None != Feed.query.filter(Feed.user_id == user.id, + Feed.link == feed["link"]).first(): + continue + new_feed = Feed(title=feed["title"], + description="", + link=feed["link"], + site_link=feed["site_link"], + created_date=datetime.datetime. + fromtimestamp(int(feed["created_date"])), + enabled=feed["enabled"]) + user.feeds.append(new_feed) + nb_feeds += 1 + db.session.commit() + # Create articles: + for feed in json_account["result"]: + user_feed = Feed.query.filter(Feed.user_id == user.id, + Feed.link == feed["link"]).first() + if None != user_feed: + for article in feed["articles"]: + if None == Article.query.filter(Article.user_id == user.id, + Article.feed_id == user_feed.id, + Article.link == article["link"]).first(): + new_article = Article(entry_id=article["link"], + link=article["link"], + title=article["title"], + content=article["content"], + readed=article["readed"], + like=article["like"], + retrieved_date=datetime.datetime. + fromtimestamp(int(article["retrieved_date"])), + date=datetime.datetime. + fromtimestamp(int(article["date"])), + user_id=user.id, + feed_id=user_feed.id) + user_feed.articles.append(new_article) + nb_articles += 1 + db.session.commit() + return nb_feeds, nb_articles + + +def export_json(user): + """ + Export all articles of user in JSON. 
+ """ + result = [] + for feed in user.feeds: + result.append({ + "title": feed.title, + "description": feed.description, + "link": feed.link, + "site_link": feed.site_link, + "enabled": feed.enabled, + "created_date": feed.created_date.strftime('%s'), + "articles": [ { + "title": article.title, + "link": article.link, + "content": article.content, + "readed": article.readed, + "like": article.like, + "date": article.date.strftime('%s'), + "retrieved_date": article.retrieved_date.strftime('%s') + } for article in feed.articles ] + }) + return jsonify(result=result) diff --git a/src/lib/feed_utils.py b/src/lib/feed_utils.py new file mode 100644 index 00000000..492391aa --- /dev/null +++ b/src/lib/feed_utils.py @@ -0,0 +1,123 @@ +import html +import urllib +import logging +import requests +import feedparser +from conf import CRAWLER_USER_AGENT +from bs4 import BeautifulSoup, SoupStrainer + +from lib.utils import try_keys, try_get_icon_url, rebuild_url + +logger = logging.getLogger(__name__) +logging.captureWarnings(True) +ACCEPTED_MIMETYPES = ('application/rss+xml', 'application/rdf+xml', + 'application/atom+xml', 'application/xml', 'text/xml') + + +def is_parsing_ok(parsed_feed): + return parsed_feed['entries'] or not parsed_feed['bozo'] + + +def escape_keys(*keys): + def wrapper(func): + def metawrapper(*args, **kwargs): + result = func(*args, **kwargs) + for key in keys: + if key in result: + result[key] = html.unescape(result[key] or '') + return result + return metawrapper + return wrapper + + +@escape_keys('title', 'description') +def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): + requests_kwargs = {'headers': {'User-Agent': CRAWLER_USER_AGENT}, + 'verify': False} + if url is None and fp_parsed is not None: + url = fp_parsed.get('url') + if url is not None and fp_parsed is None: + try: + response = requests.get(url, **requests_kwargs) + fp_parsed = feedparser.parse(response.content, + request_headers=response.headers) + except Exception: + logger.exception('failed to retreive that url') + fp_parsed = {'bozo': True} + assert url is not None and fp_parsed is not None + feed = feed or {} + feed_split = urllib.parse.urlsplit(url) + site_split = None + if is_parsing_ok(fp_parsed): + feed['link'] = url + feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link') + feed['title'] = fp_parsed['feed'].get('title') + feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title') + feed['icon_url'] = try_keys(fp_parsed['feed'], 'icon') + else: + feed['site_link'] = url + + if feed.get('site_link'): + feed['site_link'] = rebuild_url(feed['site_link'], feed_split) + site_split = urllib.parse.urlsplit(feed['site_link']) + + if feed.get('icon_url'): + feed['icon_url'] = try_get_icon_url( + feed['icon_url'], site_split, feed_split) + if feed['icon_url'] is None: + del feed['icon_url'] + + if not feed.get('site_link') or not query_site \ + or all(bool(feed.get(k)) for k in ('link', 'title', 'icon_url')): + return feed + + try: + response = requests.get(feed['site_link'], **requests_kwargs) + except Exception: + logger.exception('failed to retreive %r', feed['site_link']) + return feed + bs_parsed = BeautifulSoup(response.content, 'html.parser', + parse_only=SoupStrainer('head')) + + if not feed.get('title'): + try: + feed['title'] = bs_parsed.find_all('title')[0].text + except Exception: + pass + + def check_keys(**kwargs): + def wrapper(elem): + for key, vals in kwargs.items(): + if not elem.has_attr(key): + return False + if not all(val in elem.attrs[key] for 
val in vals): + return False + return True + return wrapper + + if not feed.get('icon_url'): + icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut'])) + if not len(icons): + icons = bs_parsed.find_all(check_keys(rel=['icon'])) + if len(icons) >= 1: + for icon in icons: + feed['icon_url'] = try_get_icon_url(icon.attrs['href'], + site_split, feed_split) + if feed['icon_url'] is not None: + break + + if feed.get('icon_url') is None: + feed['icon_url'] = try_get_icon_url('/favicon.ico', + site_split, feed_split) + if 'icon_url' in feed and feed['icon_url'] is None: + del feed['icon_url'] + + if not feed.get('link'): + for type_ in ACCEPTED_MIMETYPES: + alternates = bs_parsed.find_all(check_keys( + rel=['alternate'], type=[type_])) + if len(alternates) >= 1: + feed['link'] = rebuild_url(alternates[0].attrs['href'], + feed_split) + break + return feed diff --git a/src/lib/misc_utils.py b/src/lib/misc_utils.py new file mode 100755 index 00000000..d594c01e --- /dev/null +++ b/src/lib/misc_utils.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python +#-*- coding: utf-8 -*- + +# Newspipe - A Web based news aggregator. +# Copyright (C) 2010-2016 Cédric Bonhomme - https://www.cedricbonhomme.org +# +# For more information : https://github.com/newspipe/newspipe +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 1.9 $" +__date__ = "$Date: 2010/12/07 $" +__revision__ = "$Date: 2016/01/17 $" +__copyright__ = "Copyright (c) Cedric Bonhomme" +__license__ = "AGPLv3" + +import re +import os +import sys +import glob +import json +import logging +import operator +import urllib +import subprocess +import sqlalchemy +try: + from urlparse import urlparse, parse_qs, urlunparse +except: + from urllib.parse import urlparse, parse_qs, urlunparse, urljoin +from collections import Counter +from contextlib import contextmanager +from flask import request + +import conf +from web.controllers import ArticleController +from lib.utils import clear_string + +logger = logging.getLogger(__name__) + +ALLOWED_EXTENSIONS = set(['xml', 'opml', 'json']) + + +def is_safe_url(target): + """ + Ensures that a redirect target will lead to the same server. + """ + ref_url = urlparse(request.host_url) + test_url = urlparse(urljoin(request.host_url, target)) + return test_url.scheme in ('http', 'https') and \ + ref_url.netloc == test_url.netloc + + +def get_redirect_target(): + """ + Looks at various hints to find the redirect target. + """ + for target in request.args.get('next'), request.referrer: + if not target: + continue + if is_safe_url(target): + return target + + +def allowed_file(filename): + """ + Check if the uploaded file is allowed. + """ + return '.' 
in filename and \ + filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS + + +@contextmanager +def opened_w_error(filename, mode="r"): + try: + f = open(filename, mode) + except IOError as err: + yield None, err + else: + try: + yield f, None + finally: + f.close() + + +def fetch(id, feed_id=None): + """ + Fetch the feeds in a new processus. + The "asyncio" crawler is launched with the manager. + """ + cmd = [sys.executable, conf.BASE_DIR + '/manager.py', 'fetch_asyncio', + '--user_id='+str(id)] + if feed_id: + cmd.append('--feed_id='+str(feed_id)) + return subprocess.Popen(cmd, stdout=subprocess.PIPE) + + +def history(user_id, year=None, month=None): + """ + Sort articles by year and month. + """ + articles_counter = Counter() + articles = ArticleController(user_id).read() + if None != year: + articles = articles.filter(sqlalchemy.extract('year', 'Article.date') == year) + if None != month: + articles = articles.filter(sqlalchemy.extract('month', 'Article.date') == month) + for article in articles.all(): + if None != year: + articles_counter[article.date.month] += 1 + else: + articles_counter[article.date.year] += 1 + return articles_counter, articles + + +def clean_url(url): + """ + Remove utm_* parameters + """ + parsed_url = urlparse(url) + qd = parse_qs(parsed_url.query, keep_blank_values=True) + filtered = dict((k, v) for k, v in qd.items() + if not k.startswith('utm_')) + return urlunparse([ + parsed_url.scheme, + parsed_url.netloc, + urllib.parse.quote(urllib.parse.unquote(parsed_url.path)), + parsed_url.params, + urllib.parse.urlencode(filtered, doseq=True), + parsed_url.fragment + ]).rstrip('=') + + +def load_stop_words(): + """ + Load the stop words and return them in a list. + """ + stop_words_lists = glob.glob(os.path.join(conf.BASE_DIR, + 'web/var/stop_words/*.txt')) + stop_words = [] + + for stop_wods_list in stop_words_lists: + with opened_w_error(stop_wods_list, "r") as (stop_wods_file, err): + if err: + stop_words = [] + else: + stop_words += stop_wods_file.read().split(";") + return stop_words + + +def top_words(articles, n=10, size=5): + """ + Return the n most frequent words in a list. + """ + stop_words = load_stop_words() + words = Counter() + wordre = re.compile(r'\b\w{%s,}\b' % size, re.I) + for article in articles: + for word in [elem.lower() for elem in + wordre.findall(clear_string(article.content)) \ + if elem.lower() not in stop_words]: + words[word] += 1 + return words.most_common(n) + + +def tag_cloud(tags): + """ + Generates a tags cloud. 
+ """ + tags.sort(key=operator.itemgetter(0)) + return '\n'.join([('%s' % \ + (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word)) \ + for (word, count) in tags]) diff --git a/src/lib/utils.py b/src/lib/utils.py new file mode 100644 index 00000000..d206b769 --- /dev/null +++ b/src/lib/utils.py @@ -0,0 +1,89 @@ +import re +import types +import urllib +import logging +import requests +from hashlib import md5 +from flask import request, url_for + +import conf + +logger = logging.getLogger(__name__) + + +def default_handler(obj, role='admin'): + """JSON handler for default query formatting""" + if hasattr(obj, 'isoformat'): + return obj.isoformat() + if hasattr(obj, 'dump'): + return obj.dump(role=role) + if isinstance(obj, (set, frozenset, types.GeneratorType)): + return list(obj) + if isinstance(obj, BaseException): + return str(obj) + raise TypeError("Object of type %s with value of %r " + "is not JSON serializable" % (type(obj), obj)) + + +def try_keys(dico, *keys): + for key in keys: + if key in dico: + return dico[key] + return + + +def rebuild_url(url, base_split): + split = urllib.parse.urlsplit(url) + if split.scheme and split.netloc: + return url # url is fine + new_split = urllib.parse.SplitResult( + scheme=split.scheme or base_split.scheme, + netloc=split.netloc or base_split.netloc, + path=split.path, query='', fragment='') + return urllib.parse.urlunsplit(new_split) + + +def try_get_icon_url(url, *splits): + for split in splits: + if split is None: + continue + rb_url = rebuild_url(url, split) + response = None + # if html in content-type, we assume it's a fancy 404 page + try: + response = jarr_get(rb_url) + content_type = response.headers.get('content-type', '') + except Exception: + pass + else: + if response is not None and response.ok \ + and 'html' not in content_type and response.content: + return response.url + return None + + +def to_hash(text): + return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\ + .hexdigest() + + +def clear_string(data): + """ + Clear a string by removing HTML tags, HTML special caracters + and consecutive white spaces (more that one). 
+ """ + p = re.compile('<[^>]+>') # HTML tags + q = re.compile('\s') # consecutive white spaces + return p.sub('', q.sub(' ', data)) + + +def redirect_url(default='home'): + return request.args.get('next') or request.referrer or url_for(default) + + +async def jarr_get(url, **kwargs): + request_kwargs = {'verify': False, 'allow_redirects': True, + 'timeout': conf.CRAWLER_TIMEOUT, + 'headers': {'User-Agent': conf.CRAWLER_USER_AGENT}} + request_kwargs.update(kwargs) + return requests.get(url, **request_kwargs) diff --git a/src/tests/__init__.py b/src/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/src/tests/base.py b/src/tests/base.py deleted file mode 100644 index d6f62583..00000000 --- a/src/tests/base.py +++ /dev/null @@ -1,41 +0,0 @@ -import os -os.environ['PYAGG_TESTING'] = 'true' - -import unittest -from bootstrap import db -import runserver -from tests.fixtures import populate_db, reset_db -from werkzeug.exceptions import NotFound - - -class BasePyaggTest(unittest.TestCase): - _contr_cls = None - - def _get_from_contr(self, obj_id, user_id=None): - return self._contr_cls(user_id).get(id=obj_id).dump() - - def _test_controller_rights(self, obj, user_id): - obj_id = obj['id'] - self.assertEquals(obj, self._get_from_contr(obj_id)) - self.assertEquals(obj, self._get_from_contr(obj_id, user_id)) - # fetching non existent object - self.assertRaises(NotFound, self._get_from_contr, 99, user_id) - # fetching object with inexistent user - self.assertRaises(NotFound, self._get_from_contr, obj_id, 99) - # fetching object with wrong user - self.assertRaises(NotFound, self._get_from_contr, obj_id, user_id + 1) - self.assertRaises(NotFound, self._contr_cls().delete, 99) - self.assertRaises(NotFound, self._contr_cls(user_id).delete, 99) - self.assertEquals(obj['id'], - self._contr_cls(user_id).delete(obj_id).id) - self.assertRaises(NotFound, self._contr_cls(user_id).delete, obj_id) - - def setUp(self): - populate_db(db) - - def tearDown(self): - reset_db(db) - - -if __name__ == '__main__': - unittest.main() diff --git a/src/tests/controllers/__init__.py b/src/tests/controllers/__init__.py deleted file mode 100644 index 26922c43..00000000 --- a/src/tests/controllers/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from tests.controllers.feed import FeedControllerTest -from tests.controllers.article import ArticleControllerTest - - -__all__ = ['FeedControllerTest', 'ArticleControllerTest'] diff --git a/src/tests/controllers/article.py b/src/tests/controllers/article.py deleted file mode 100644 index a62d1a83..00000000 --- a/src/tests/controllers/article.py +++ /dev/null @@ -1,117 +0,0 @@ -from tests.base import BasePyaggTest -from web.controllers import ArticleController -from web.controllers import FeedController - - -class ArticleControllerTest(BasePyaggTest): - _contr_cls = ArticleController - - def test_article_rights(self): - article = ArticleController(2).read()[0].dump() - self.assertFalse(article['readed']) - article['readed'] = True # article get read when retreived through get - self._test_controller_rights(article, article['user_id']) - - def test_article_challange_method(self): - self.assertEquals(0, len(list(ArticleController().challenge( - [{'id': art.id} for art in ArticleController(3).read()])))) - self.assertEquals(9, len(list(ArticleController(2).challenge( - [{'id': art.id} for art in ArticleController(3).read()])))) - self.assertEquals(9, len(list(ArticleController(2).challenge( - [{'entry_id': art.id} for art in ArticleController(3).read()] - )))) - - def 
test_article_get_unread(self): - self.assertEquals({1: 3, 2: 3, 3: 3}, - ArticleController(2).count_by_feed(readed=False)) - self.assertEquals({4: 3, 5: 3, 6: 3}, - ArticleController(3).count_by_feed(readed=False)) - - def test_create_using_filters(self): - feed_ctr = FeedController(2) - feed1 = feed_ctr.read()[0].dump() - feed2 = feed_ctr.read()[1].dump() - feed3 = feed_ctr.read()[2].dump() - feed_ctr.update({'id': feed1['id']}, - {'filters': [{"type": "simple match", - "pattern": "no see pattern", - "action on": "match", - "action": "mark as read"}]}) - feed_ctr.update({'id': feed3['id']}, - {'filters': [{"type": "regex", - "pattern": ".*(pattern1|pattern2).*", - "action on": "no match", - "action": "mark as favorite"}, - {"type": "simple match", - "pattern": "no see pattern", - "action on": "match", - "action": "mark as read"}]}) - art1 = ArticleController(2).create( - entry_id="thisisnotatest", - feed_id=feed1['id'], - title="garbage no see pattern garbage", - content="doesn't matter", - link="doesn't matter either") - art2 = ArticleController(2).create( - entry_id="thisisnotatesteither", - feed_id=feed1['id'], - title="garbage see pattern garbage", - content="doesn't matter2", - link="doesn't matter either2") - - art3 = ArticleController(2).create( - entry_id="thisisnotatest", - user_id=2, - feed_id=feed2['id'], - title="garbage no see pattern garbage", - content="doesn't matter", - link="doesn't matter either") - art4 = ArticleController(2).create( - entry_id="thisisnotatesteither", - user_id=2, - feed_id=feed2['id'], - title="garbage see pattern garbage", - content="doesn't matter2", - link="doesn't matter either2") - - art5 = ArticleController(2).create( - entry_id="thisisnotatest", - feed_id=feed3['id'], - title="garbage pattern1 garbage", - content="doesn't matter", - link="doesn't matter either") - art6 = ArticleController(2).create( - entry_id="thisisnotatesteither", - feed_id=feed3['id'], - title="garbage pattern2 garbage", - content="doesn't matter2", - link="doesn't matter either2") - art7 = ArticleController(2).create( - entry_id="thisisnotatesteither", - feed_id=feed3['id'], - title="garbage no see pattern3 garbage", - content="doesn't matter3", - link="doesn't matter either3") - art8 = ArticleController(2).create( - entry_id="thisisnotatesteither", - feed_id=feed3['id'], - title="garbage pattern4 garbage", - content="doesn't matter4", - link="doesn't matter either4") - - self.assertTrue(art1.readed) - self.assertFalse(art1.like) - self.assertFalse(art2.readed) - self.assertFalse(art2.like) - self.assertFalse(art3.readed) - self.assertFalse(art3.like) - self.assertFalse(art4.readed) - self.assertFalse(art4.like) - self.assertFalse(art5.readed) - self.assertFalse(art5.like) - self.assertFalse(art6.readed) - self.assertFalse(art6.like) - self.assertTrue(art7.readed) - self.assertTrue(art7.like) - self.assertFalse(art8.readed) - self.assertTrue(art8.like) diff --git a/src/tests/controllers/feed.py b/src/tests/controllers/feed.py deleted file mode 100644 index 7dd77295..00000000 --- a/src/tests/controllers/feed.py +++ /dev/null @@ -1,27 +0,0 @@ -from tests.base import BasePyaggTest -from web.controllers import FeedController -from web.controllers import ArticleController - - -class FeedControllerTest(BasePyaggTest): - _contr_cls = FeedController - - def test_feed_rights(self): - feed = FeedController(2).read()[0].dump() - self.assertTrue(3, - ArticleController().read(feed_id=feed['id']).count()) - self._test_controller_rights(feed, feed['user_id']) - # checking articles are 
deleted after the feed has been deleted - - def test_feed_article_deletion(self): - feed_ctr = FeedController(2) - feed = feed_ctr.read()[0].dump() - feed_ctr.delete(feed['id']) - self.assertFalse(0, - ArticleController().read(feed_id=feed['id']).count()) - - def test_feed_list_fetchable(self): - self.assertEquals(3, len(FeedController(3).list_fetchable())) - self.assertEquals(0, len(FeedController(3).list_fetchable())) - self.assertEquals(3, len(FeedController().list_fetchable())) - self.assertEquals(0, len(FeedController().list_fetchable())) diff --git a/src/tests/fixtures.py b/src/tests/fixtures.py deleted file mode 100644 index 16a9cb81..00000000 --- a/src/tests/fixtures.py +++ /dev/null @@ -1,31 +0,0 @@ -from web.models import db_create, db_empty, User, Article, Feed - - -def populate_db(db): - role_admin, role_user = db_create(db) - user1, user2 = [User(nickname=name, email="%s@test.te" % name, - pwdhash=name, roles=[role_user], enabled=True) - for name in ["user1", "user2"]] - db.session.add(user1) - db.session.add(user2) - db.session.commit() - - for user in (user1, user2): - for feed_name in ['feed1', 'feed2', 'feed3']: - feed = Feed(link=feed_name, user_id=user.id, - title="%r %r" % (user.nickname, feed_name)) - db.session.add(feed) - db.session.commit() - for article in ['article1', 'article2', 'article3']: - entry = "%s %s %s" % (user.nickname, feed.title, article) - article = Article(entry_id=entry, link=article, - feed_id=feed.id, user_id=user.id, - title=entry, content=article) - db.session.add(article) - db.session.commit() - - db.session.commit() - - -def reset_db(db): - db_empty(db) diff --git a/src/web/controllers/article.py b/src/web/controllers/article.py index 4607b225..d7058229 100644 --- a/src/web/controllers/article.py +++ b/src/web/controllers/article.py @@ -6,7 +6,7 @@ from collections import Counter from bootstrap import db from .abstract import AbstractController -from web.lib.article_utils import process_filters +from lib.article_utils import process_filters from web.controllers import CategoryController, FeedController from web.models import Article diff --git a/src/web/controllers/feed.py b/src/web/controllers/feed.py index 7203c37e..a77fd926 100644 --- a/src/web/controllers/feed.py +++ b/src/web/controllers/feed.py @@ -6,7 +6,7 @@ import conf from .abstract import AbstractController from .icon import IconController from web.models import User, Feed -from web.lib.utils import clear_string +from lib.utils import clear_string logger = logging.getLogger(__name__) DEFAULT_LIMIT = 5 diff --git a/src/web/export.py b/src/web/export.py deleted file mode 100644 index 98473c9e..00000000 --- a/src/web/export.py +++ /dev/null @@ -1,58 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# Newspipe - A Web based news aggregator. -# Copyright (C) 2010-2016 Cédric Bonhomme - https://www.cedricbonhomme.org -# -# For more information : https://github.com/Newspipe/Newspipe -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. 
-# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.7 $" -__date__ = "$Date: 2011/10/24 $" -__revision__ = "$Date: 2016/10/06 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "AGPLv3" - -# -# This file contains the export functions of newspipe. -# - -from flask import jsonify - -def export_json(user): - """ - Export all articles of user in JSON. - """ - result = [] - for feed in user.feeds: - result.append({ - "title": feed.title, - "description": feed.description, - "link": feed.link, - "site_link": feed.site_link, - "enabled": feed.enabled, - "created_date": feed.created_date.strftime('%s'), - "articles": [ { - "title": article.title, - "link": article.link, - "content": article.content, - "readed": article.readed, - "like": article.like, - "date": article.date.strftime('%s'), - "retrieved_date": article.retrieved_date.strftime('%s') - } for article in feed.articles ] - }) - return jsonify(result=result) diff --git a/src/web/forms.py b/src/web/forms.py index be1650d8..8088f27b 100644 --- a/src/web/forms.py +++ b/src/web/forms.py @@ -34,7 +34,7 @@ from wtforms import TextField, TextAreaField, PasswordField, BooleanField, \ SubmitField, IntegerField, SelectField, validators, HiddenField from wtforms.fields.html5 import EmailField, URLField -from web.lib import misc_utils +from lib import misc_utils from web.controllers import UserController from web.models import User diff --git a/src/web/lib/article_utils.py b/src/web/lib/article_utils.py deleted file mode 100644 index 2c5ea8c3..00000000 --- a/src/web/lib/article_utils.py +++ /dev/null @@ -1,186 +0,0 @@ -import html -import logging -import re -from datetime import datetime, timezone -from enum import Enum -from urllib.parse import SplitResult, urlsplit, urlunsplit - -import dateutil.parser -from bs4 import BeautifulSoup, SoupStrainer -from requests.exceptions import MissingSchema - -import conf -from web.lib.utils import jarr_get - -logger = logging.getLogger(__name__) -PROCESSED_DATE_KEYS = {'published', 'created', 'updated'} - - -def extract_id(entry): - """ extract a value from an entry that will identify it among the other of - that feed""" - return entry.get('entry_id') or entry.get('id') or entry['link'] - - -async def construct_article(entry, feed, fields=None, fetch=True): - "Safe method to transorm a feedparser entry into an article" - now = datetime.utcnow() - article = {} - def push_in_article(key, value): - if not fields or key in fields: - article[key] = value - push_in_article('feed_id', feed.id) - push_in_article('user_id', feed.user_id) - push_in_article('entry_id', extract_id(entry)) - push_in_article('retrieved_date', now) - if not fields or 'date' in fields: - for date_key in PROCESSED_DATE_KEYS: - if entry.get(date_key): - try: - article['date'] = dateutil.parser.parse(entry[date_key])\ - .astimezone(timezone.utc) - except Exception as e: - logger.exception(e) - else: - break - push_in_article('content', get_article_content(entry)) - if fields is None or {'link', 'title'}.intersection(fields): - link, title = await get_article_details(entry, fetch) - push_in_article('link', link) - push_in_article('title', title) - if 'content' in article: - #push_in_article('content', clean_urls(article['content'], link)) - push_in_article('content', article['content']) - push_in_article('tags', {tag.get('term').strip() - for tag in entry.get('tags', []) \ - if tag and tag.get('term', 
False)}) - return article - - -def get_article_content(entry): - content = '' - if entry.get('content'): - content = entry['content'][0]['value'] - elif entry.get('summary'): - content = entry['summary'] - return content - - -async def get_article_details(entry, fetch=True): - article_link = entry.get('link') - article_title = html.unescape(entry.get('title', '')) - if fetch and conf.CRAWLER_RESOLV and article_link or not article_title: - try: - # resolves URL behind proxies (like feedproxy.google.com) - response = await jarr_get(article_link, timeout=5) - except MissingSchema: - split, failed = urlsplit(article_link), False - for scheme in 'https', 'http': - new_link = urlunsplit(SplitResult(scheme, *split[1:])) - try: - response = await jarr_get(new_link, timeout=5) - except Exception as error: - failed = True - continue - failed = False - article_link = new_link - break - if failed: - return article_link, article_title or 'No title' - except Exception as error: - logger.info("Unable to get the real URL of %s. Won't fix " - "link or title. Error: %s", article_link, error) - return article_link, article_title or 'No title' - article_link = response.url - if not article_title: - bs_parsed = BeautifulSoup(response.content, 'html.parser', - parse_only=SoupStrainer('head')) - try: - article_title = bs_parsed.find_all('title')[0].text - except IndexError: # no title - pass - return article_link, article_title or 'No title' - - -class FiltersAction(Enum): - READ = 'mark as read' - LIKED = 'mark as favorite' - SKIP = 'skipped' - - -class FiltersType(Enum): - REGEX = 'regex' - MATCH = 'simple match' - EXACT_MATCH = 'exact match' - TAG_MATCH = 'tag match' - TAG_CONTAINS = 'tag contains' - - -class FiltersTrigger(Enum): - MATCH = 'match' - NO_MATCH = 'no match' - - -def process_filters(filters, article, only_actions=None): - skipped, read, liked = False, None, False - filters = filters or [] - if only_actions is None: - only_actions = set(FiltersAction) - for filter_ in filters: - match = False - try: - pattern = filter_.get('pattern', '') - filter_type = FiltersType(filter_.get('type')) - filter_action = FiltersAction(filter_.get('action')) - filter_trigger = FiltersTrigger(filter_.get('action on')) - if filter_type is not FiltersType.REGEX: - pattern = pattern.lower() - except ValueError: - continue - if filter_action not in only_actions: - logger.debug('ignoring filter %r' % filter_) - continue - if filter_action in {FiltersType.REGEX, FiltersType.MATCH, - FiltersType.EXACT_MATCH} and 'title' not in article: - continue - if filter_action in {FiltersType.TAG_MATCH, FiltersType.TAG_CONTAINS} \ - and 'tags' not in article: - continue - title = article.get('title', '').lower() - tags = [tag.lower() for tag in article.get('tags', [])] - if filter_type is FiltersType.REGEX: - match = re.match(pattern, title) - elif filter_type is FiltersType.MATCH: - match = pattern in title - elif filter_type is FiltersType.EXACT_MATCH: - match = pattern == title - elif filter_type is FiltersType.TAG_MATCH: - match = pattern in tags - elif filter_type is FiltersType.TAG_CONTAINS: - match = any(pattern in tag for tag in tags) - take_action = match and filter_trigger is FiltersTrigger.MATCH \ - or not match and filter_trigger is FiltersTrigger.NO_MATCH - - if not take_action: - continue - - if filter_action is FiltersAction.READ: - read = True - elif filter_action is FiltersAction.LIKED: - liked = True - elif filter_action is FiltersAction.SKIP: - skipped = True - - if skipped or read or liked: - logger.info("%r 
applied on %r", filter_action.value, - article.get('link') or article.get('title')) - return skipped, read, liked - - -def get_skip_and_ids(entry, feed): - entry_ids = construct_article(entry, feed, - {'entry_id', 'feed_id', 'user_id'}, fetch=False) - skipped, _, _ = process_filters(feed.filters, - construct_article(entry, feed, {'title', 'tags'}, fetch=False), - {FiltersAction.SKIP}) - return skipped, entry_ids diff --git a/src/web/lib/feed_utils.py b/src/web/lib/feed_utils.py deleted file mode 100644 index ef5d4f08..00000000 --- a/src/web/lib/feed_utils.py +++ /dev/null @@ -1,123 +0,0 @@ -import html -import urllib -import logging -import requests -import feedparser -from conf import CRAWLER_USER_AGENT -from bs4 import BeautifulSoup, SoupStrainer - -from web.lib.utils import try_keys, try_get_icon_url, rebuild_url - -logger = logging.getLogger(__name__) -logging.captureWarnings(True) -ACCEPTED_MIMETYPES = ('application/rss+xml', 'application/rdf+xml', - 'application/atom+xml', 'application/xml', 'text/xml') - - -def is_parsing_ok(parsed_feed): - return parsed_feed['entries'] or not parsed_feed['bozo'] - - -def escape_keys(*keys): - def wrapper(func): - def metawrapper(*args, **kwargs): - result = func(*args, **kwargs) - for key in keys: - if key in result: - result[key] = html.unescape(result[key] or '') - return result - return metawrapper - return wrapper - - -@escape_keys('title', 'description') -def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): - requests_kwargs = {'headers': {'User-Agent': CRAWLER_USER_AGENT}, - 'verify': False} - if url is None and fp_parsed is not None: - url = fp_parsed.get('url') - if url is not None and fp_parsed is None: - try: - response = requests.get(url, **requests_kwargs) - fp_parsed = feedparser.parse(response.content, - request_headers=response.headers) - except Exception: - logger.exception('failed to retreive that url') - fp_parsed = {'bozo': True} - assert url is not None and fp_parsed is not None - feed = feed or {} - feed_split = urllib.parse.urlsplit(url) - site_split = None - if is_parsing_ok(fp_parsed): - feed['link'] = url - feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link') - feed['title'] = fp_parsed['feed'].get('title') - feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title') - feed['icon_url'] = try_keys(fp_parsed['feed'], 'icon') - else: - feed['site_link'] = url - - if feed.get('site_link'): - feed['site_link'] = rebuild_url(feed['site_link'], feed_split) - site_split = urllib.parse.urlsplit(feed['site_link']) - - if feed.get('icon_url'): - feed['icon_url'] = try_get_icon_url( - feed['icon_url'], site_split, feed_split) - if feed['icon_url'] is None: - del feed['icon_url'] - - if not feed.get('site_link') or not query_site \ - or all(bool(feed.get(k)) for k in ('link', 'title', 'icon_url')): - return feed - - try: - response = requests.get(feed['site_link'], **requests_kwargs) - except Exception: - logger.exception('failed to retreive %r', feed['site_link']) - return feed - bs_parsed = BeautifulSoup(response.content, 'html.parser', - parse_only=SoupStrainer('head')) - - if not feed.get('title'): - try: - feed['title'] = bs_parsed.find_all('title')[0].text - except Exception: - pass - - def check_keys(**kwargs): - def wrapper(elem): - for key, vals in kwargs.items(): - if not elem.has_attr(key): - return False - if not all(val in elem.attrs[key] for val in vals): - return False - return True - return wrapper - - if not feed.get('icon_url'): - icons = 
bs_parsed.find_all(check_keys(rel=['icon', 'shortcut'])) - if not len(icons): - icons = bs_parsed.find_all(check_keys(rel=['icon'])) - if len(icons) >= 1: - for icon in icons: - feed['icon_url'] = try_get_icon_url(icon.attrs['href'], - site_split, feed_split) - if feed['icon_url'] is not None: - break - - if feed.get('icon_url') is None: - feed['icon_url'] = try_get_icon_url('/favicon.ico', - site_split, feed_split) - if 'icon_url' in feed and feed['icon_url'] is None: - del feed['icon_url'] - - if not feed.get('link'): - for type_ in ACCEPTED_MIMETYPES: - alternates = bs_parsed.find_all(check_keys( - rel=['alternate'], type=[type_])) - if len(alternates) >= 1: - feed['link'] = rebuild_url(alternates[0].attrs['href'], - feed_split) - break - return feed diff --git a/src/web/lib/misc_utils.py b/src/web/lib/misc_utils.py deleted file mode 100755 index 6a0e00ec..00000000 --- a/src/web/lib/misc_utils.py +++ /dev/null @@ -1,286 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# Newspipe - A Web based news aggregator. -# Copyright (C) 2010-2016 Cédric Bonhomme - https://www.cedricbonhomme.org -# -# For more information : https://github.com/Newspipe/Newspipe -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see . - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 1.8 $" -__date__ = "$Date: 2010/12/07 $" -__revision__ = "$Date: 2016/04/10 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "AGPLv3" - -# -# This file provides functions used for: -# - import from a JSON file; -# - generation of tags cloud; -# - HTML processing. -# - -import re -import os -import sys -import glob -import opml -import json -import logging -import datetime -import operator -import urllib -import subprocess -import sqlalchemy -try: - from urlparse import urlparse, parse_qs, urlunparse -except: - from urllib.parse import urlparse, parse_qs, urlunparse, urljoin -from bs4 import BeautifulSoup -from collections import Counter -from contextlib import contextmanager -from flask import request - -import conf -from bootstrap import db -from web import controllers -from web.models import User, Feed, Article -from web.lib.utils import clear_string - -logger = logging.getLogger(__name__) - -ALLOWED_EXTENSIONS = set(['xml', 'opml', 'json']) - -def is_safe_url(target): - """ - Ensures that a redirect target will lead to the same server. - """ - ref_url = urlparse(request.host_url) - test_url = urlparse(urljoin(request.host_url, target)) - return test_url.scheme in ('http', 'https') and \ - ref_url.netloc == test_url.netloc - -def get_redirect_target(): - """ - Looks at various hints to find the redirect target. - """ - for target in request.args.get('next'), request.referrer: - if not target: - continue - if is_safe_url(target): - return target - -def allowed_file(filename): - """ - Check if the uploaded file is allowed. - """ - return '.' 
in filename and \ - filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS - -@contextmanager -def opened_w_error(filename, mode="r"): - try: - f = open(filename, mode) - except IOError as err: - yield None, err - else: - try: - yield f, None - finally: - f.close() - -def fetch(id, feed_id=None): - """ - Fetch the feeds in a new processus. - The "asyncio" crawler is launched with the manager. - """ - cmd = [sys.executable, conf.BASE_DIR + '/manager.py', 'fetch_asyncio', - '--user_id='+str(id)] - if feed_id: - cmd.append('--feed_id='+str(feed_id)) - return subprocess.Popen(cmd, stdout=subprocess.PIPE) - -def history(user_id, year=None, month=None): - """ - Sort articles by year and month. - """ - articles_counter = Counter() - articles = controllers.ArticleController(user_id).read() - if None != year: - articles = articles.filter(sqlalchemy.extract('year', Article.date) == year) - if None != month: - articles = articles.filter(sqlalchemy.extract('month', Article.date) == month) - for article in articles.all(): - if None != year: - articles_counter[article.date.month] += 1 - else: - articles_counter[article.date.year] += 1 - return articles_counter, articles - -def import_opml(email, opml_content): - """ - Import new feeds from an OPML file. - """ - user = User.query.filter(User.email == email).first() - try: - subscriptions = opml.from_string(opml_content) - except: - logger.exception("Parsing OPML file failed:") - raise - - def read(subsubscription, nb=0): - """ - Parse recursively through the categories and sub-categories. - """ - for subscription in subsubscription: - if len(subscription) != 0: - nb = read(subscription, nb) - else: - try: - title = subscription.text - except: - title = "" - try: - description = subscription.description - except: - description = "" - try: - link = subscription.xmlUrl - except: - continue - if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == link).first(): - continue - try: - site_link = subscription.htmlUrl - except: - site_link = "" - new_feed = Feed(title=title, description=description, - link=link, site_link=site_link, - enabled=True) - user.feeds.append(new_feed) - nb += 1 - return nb - nb = read(subscriptions) - db.session.commit() - return nb - -def import_json(email, json_content): - """ - Import an account from a JSON file. - """ - user = User.query.filter(User.email == email).first() - json_account = json.loads(json_content.decode("utf-8")) - nb_feeds, nb_articles = 0, 0 - # Create feeds: - for feed in json_account["result"]: - if None != Feed.query.filter(Feed.user_id == user.id, - Feed.link == feed["link"]).first(): - continue - new_feed = Feed(title=feed["title"], - description="", - link=feed["link"], - site_link=feed["site_link"], - created_date=datetime.datetime. - fromtimestamp(int(feed["created_date"])), - enabled=feed["enabled"]) - user.feeds.append(new_feed) - nb_feeds += 1 - db.session.commit() - # Create articles: - for feed in json_account["result"]: - user_feed = Feed.query.filter(Feed.user_id == user.id, - Feed.link == feed["link"]).first() - if None != user_feed: - for article in feed["articles"]: - if None == Article.query.filter(Article.user_id == user.id, - Article.feed_id == user_feed.id, - Article.link == article["link"]).first(): - new_article = Article(entry_id=article["link"], - link=article["link"], - title=article["title"], - content=article["content"], - readed=article["readed"], - like=article["like"], - retrieved_date=datetime.datetime. 
- fromtimestamp(int(article["retrieved_date"])), - date=datetime.datetime. - fromtimestamp(int(article["date"])), - user_id=user.id, - feed_id=user_feed.id) - user_feed.articles.append(new_article) - nb_articles += 1 - db.session.commit() - return nb_feeds, nb_articles - -def clean_url(url): - """ - Remove utm_* parameters - """ - parsed_url = urlparse(url) - qd = parse_qs(parsed_url.query, keep_blank_values=True) - filtered = dict((k, v) for k, v in qd.items() - if not k.startswith('utm_')) - return urlunparse([ - parsed_url.scheme, - parsed_url.netloc, - urllib.parse.quote(urllib.parse.unquote(parsed_url.path)), - parsed_url.params, - urllib.parse.urlencode(filtered, doseq=True), - parsed_url.fragment - ]).rstrip('=') - -def load_stop_words(): - """ - Load the stop words and return them in a list. - """ - stop_words_lists = glob.glob(os.path.join(conf.BASE_DIR, - 'web/var/stop_words/*.txt')) - stop_words = [] - - for stop_wods_list in stop_words_lists: - with opened_w_error(stop_wods_list, "r") as (stop_wods_file, err): - if err: - stop_words = [] - else: - stop_words += stop_wods_file.read().split(";") - return stop_words - -def top_words(articles, n=10, size=5): - """ - Return the n most frequent words in a list. - """ - stop_words = load_stop_words() - words = Counter() - wordre = re.compile(r'\b\w{%s,}\b' % size, re.I) - for article in articles: - for word in [elem.lower() for elem in - wordre.findall(clear_string(article.content)) \ - if elem.lower() not in stop_words]: - words[word] += 1 - return words.most_common(n) - -def tag_cloud(tags): - """ - Generates a tags cloud. - """ - tags.sort(key=operator.itemgetter(0)) - return '\n'.join([('%s' % \ - (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word)) \ - for (word, count) in tags]) - -if __name__ == "__main__": - import_opml("root@newspipe.localhost", "./var/feeds_test.opml") - #import_opml("root@newspipe.localhost", "./var/Newspipe.opml") diff --git a/src/web/lib/utils.py b/src/web/lib/utils.py deleted file mode 100644 index d206b769..00000000 --- a/src/web/lib/utils.py +++ /dev/null @@ -1,89 +0,0 @@ -import re -import types -import urllib -import logging -import requests -from hashlib import md5 -from flask import request, url_for - -import conf - -logger = logging.getLogger(__name__) - - -def default_handler(obj, role='admin'): - """JSON handler for default query formatting""" - if hasattr(obj, 'isoformat'): - return obj.isoformat() - if hasattr(obj, 'dump'): - return obj.dump(role=role) - if isinstance(obj, (set, frozenset, types.GeneratorType)): - return list(obj) - if isinstance(obj, BaseException): - return str(obj) - raise TypeError("Object of type %s with value of %r " - "is not JSON serializable" % (type(obj), obj)) - - -def try_keys(dico, *keys): - for key in keys: - if key in dico: - return dico[key] - return - - -def rebuild_url(url, base_split): - split = urllib.parse.urlsplit(url) - if split.scheme and split.netloc: - return url # url is fine - new_split = urllib.parse.SplitResult( - scheme=split.scheme or base_split.scheme, - netloc=split.netloc or base_split.netloc, - path=split.path, query='', fragment='') - return urllib.parse.urlunsplit(new_split) - - -def try_get_icon_url(url, *splits): - for split in splits: - if split is None: - continue - rb_url = rebuild_url(url, split) - response = None - # if html in content-type, we assume it's a fancy 404 page - try: - response = jarr_get(rb_url) - content_type = response.headers.get('content-type', '') - except Exception: - pass - else: - if response is not 
None and response.ok \ - and 'html' not in content_type and response.content: - return response.url - return None - - -def to_hash(text): - return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\ - .hexdigest() - - -def clear_string(data): - """ - Clear a string by removing HTML tags, HTML special caracters - and consecutive white spaces (more that one). - """ - p = re.compile('<[^>]+>') # HTML tags - q = re.compile('\s') # consecutive white spaces - return p.sub('', q.sub(' ', data)) - - -def redirect_url(default='home'): - return request.args.get('next') or request.referrer or url_for(default) - - -async def jarr_get(url, **kwargs): - request_kwargs = {'verify': False, 'allow_redirects': True, - 'timeout': conf.CRAWLER_TIMEOUT, - 'headers': {'User-Agent': conf.CRAWLER_USER_AGENT}} - request_kwargs.update(kwargs) - return requests.get(url, **request_kwargs) diff --git a/src/web/lib/view_utils.py b/src/web/lib/view_utils.py index d4c119da..1d8c6aed 100644 --- a/src/web/lib/view_utils.py +++ b/src/web/lib/view_utils.py @@ -1,6 +1,6 @@ from functools import wraps from flask import request, Response, make_response -from web.lib.utils import to_hash +from lib.utils import to_hash def etag_match(func): diff --git a/src/web/models/category.py b/src/web/models/category.py index 15b616bf..2da7809a 100644 --- a/src/web/models/category.py +++ b/src/web/models/category.py @@ -1,3 +1,6 @@ +#! /usr/bin/env python +# -*- coding: utf-8 -*- + from bootstrap import db from sqlalchemy import Index from web.models.right_mixin import RightMixin @@ -10,7 +13,7 @@ class Category(db.Model, RightMixin): # relationships user_id = db.Column(db.Integer, db.ForeignKey('user.id')) feeds = db.relationship('Feed', cascade='all,delete-orphan') - articles = db.relationship('Article', + articles = db.relationship('Article', cascade='all,delete-orphan') # index diff --git a/src/web/models/icon.py b/src/web/models/icon.py index 22ef1164..adc9cf69 100644 --- a/src/web/models/icon.py +++ b/src/web/models/icon.py @@ -1,3 +1,6 @@ +#! 
/usr/bin/env python +# -*- coding: utf-8 -*- + from bootstrap import db diff --git a/src/web/views/admin.py b/src/web/views/admin.py index a9e1e43d..4de4009a 100644 --- a/src/web/views/admin.py +++ b/src/web/views/admin.py @@ -4,8 +4,8 @@ from flask_babel import gettext, format_timedelta from flask_login import login_required, current_user from werkzeug import generate_password_hash +from lib.utils import redirect_url from web.views.common import admin_permission -from web.lib.utils import redirect_url from web.controllers import UserController from web.forms import InformationMessageForm, UserForm diff --git a/src/web/views/article.py b/src/web/views/article.py index 283ef001..640de8b4 100644 --- a/src/web/views/article.py +++ b/src/web/views/article.py @@ -7,8 +7,8 @@ from flask_login import login_required, current_user from bootstrap import db -from web.export import export_json -from web.lib.utils import clear_string, redirect_url +from lib.utils import clear_string, redirect_url +from lib.data import export_json from web.controllers import (ArticleController, UserController, CategoryController) from web.lib.view_utils import etag_match diff --git a/src/web/views/category.py b/src/web/views/category.py index 1a81a5c4..2bdcf9cc 100644 --- a/src/web/views/category.py +++ b/src/web/views/category.py @@ -3,7 +3,7 @@ from flask_babel import gettext from flask_login import login_required, current_user from web.forms import CategoryForm -from web.lib.utils import redirect_url +from lib.utils import redirect_url from web.lib.view_utils import etag_match from web.controllers import ArticleController, FeedController, \ CategoryController diff --git a/src/web/views/common.py b/src/web/views/common.py index f9613c01..e422fd57 100644 --- a/src/web/views/common.py +++ b/src/web/views/common.py @@ -6,7 +6,7 @@ from flask_login import login_user from flask_principal import (Identity, Permission, RoleNeed, session_identity_loader, identity_changed) from web.controllers import UserController -from web.lib.utils import default_handler +from lib.utils import default_handler admin_role = RoleNeed('admin') api_role = RoleNeed('api') diff --git a/src/web/views/feed.py b/src/web/views/feed.py index 3edb942e..fa5cfc77 100644 --- a/src/web/views/feed.py +++ b/src/web/views/feed.py @@ -10,9 +10,9 @@ from flask_babel import gettext from flask_login import login_required, current_user import conf -from web.lib import misc_utils, utils +from lib import misc_utils, utils +from lib.feed_utils import construct_feed_from from web.lib.view_utils import etag_match -from web.lib.feed_utils import construct_feed_from from web.forms import AddFeedForm from web.controllers import (CategoryController, FeedController, ArticleController) diff --git a/src/web/views/home.py b/src/web/views/home.py index 179f3f9d..5274dc12 100644 --- a/src/web/views/home.py +++ b/src/web/views/home.py @@ -9,8 +9,8 @@ from flask_babel import gettext, get_locale from babel.dates import format_datetime, format_timedelta import conf -from web.lib.utils import redirect_url -from web.lib import misc_utils +from lib.utils import redirect_url +from lib import misc_utils from web.lib.view_utils import etag_match from web.views.common import jsonify diff --git a/src/web/views/user.py b/src/web/views/user.py index 91cf7e4a..58c23dd2 100644 --- a/src/web/views/user.py +++ b/src/web/views/user.py @@ -8,7 +8,8 @@ from flask_login import login_required, current_user import conf from notifications import notifications -from web.lib import misc_utils +from lib 
import misc_utils +from lib.data import import_opml, import_json from web.lib.user_utils import confirm_token from web.controllers import (UserController, FeedController, ArticleController, CategoryController) @@ -59,7 +60,7 @@ def management(): flash(gettext('File not allowed.'), 'danger') else: try: - nb = misc_utils.import_opml(current_user.email, data.read()) + nb = import_opml(current_user.email, data.read()) if conf.CRAWLING_METHOD == "classic": misc_utils.fetch(current_user.email, None) flash(str(nb) + ' ' + gettext('feeds imported.'), @@ -75,7 +76,7 @@ def management(): flash(gettext('File not allowed.'), 'danger') else: try: - nb = misc_utils.import_json(current_user.email, data.read()) + nb = import_json(current_user.email, data.read()) flash(gettext('Account imported.'), "success") except: flash(gettext("Impossible to import the account."), -- cgit