diff options
Diffstat (limited to 'src/lib')
-rw-r--r-- | src/lib/__init__.py | 0 | ||||
-rw-r--r-- | src/lib/article_utils.py | 186 | ||||
-rw-r--r-- | src/lib/data.py | 162 | ||||
-rw-r--r-- | src/lib/feed_utils.py | 123 | ||||
-rwxr-xr-x | src/lib/misc_utils.py | 185 | ||||
-rw-r--r-- | src/lib/utils.py | 89 |
6 files changed, 745 insertions, 0 deletions
diff --git a/src/lib/__init__.py b/src/lib/__init__.py new file mode 100644 index 00000000..e69de29b --- /dev/null +++ b/src/lib/__init__.py diff --git a/src/lib/article_utils.py b/src/lib/article_utils.py new file mode 100644 index 00000000..49494e85 --- /dev/null +++ b/src/lib/article_utils.py @@ -0,0 +1,186 @@ +import html +import logging +import re +from datetime import datetime, timezone +from enum import Enum +from urllib.parse import SplitResult, urlsplit, urlunsplit + +import dateutil.parser +from bs4 import BeautifulSoup, SoupStrainer +from requests.exceptions import MissingSchema + +import conf +from lib.utils import jarr_get + +logger = logging.getLogger(__name__) +PROCESSED_DATE_KEYS = {'published', 'created', 'updated'} + + +def extract_id(entry): + """ extract a value from an entry that will identify it among the other of + that feed""" + return entry.get('entry_id') or entry.get('id') or entry['link'] + + +async def construct_article(entry, feed, fields=None, fetch=True): + "Safe method to transorm a feedparser entry into an article" + now = datetime.utcnow() + article = {} + def push_in_article(key, value): + if not fields or key in fields: + article[key] = value + push_in_article('feed_id', feed.id) + push_in_article('user_id', feed.user_id) + push_in_article('entry_id', extract_id(entry)) + push_in_article('retrieved_date', now) + if not fields or 'date' in fields: + for date_key in PROCESSED_DATE_KEYS: + if entry.get(date_key): + try: + article['date'] = dateutil.parser.parse(entry[date_key])\ + .astimezone(timezone.utc) + except Exception as e: + logger.exception(e) + else: + break + push_in_article('content', get_article_content(entry)) + if fields is None or {'link', 'title'}.intersection(fields): + link, title = await get_article_details(entry, fetch) + push_in_article('link', link) + push_in_article('title', title) + if 'content' in article: + #push_in_article('content', clean_urls(article['content'], link)) + push_in_article('content', article['content']) + push_in_article('tags', {tag.get('term').strip() + for tag in entry.get('tags', []) \ + if tag and tag.get('term', False)}) + return article + + +def get_article_content(entry): + content = '' + if entry.get('content'): + content = entry['content'][0]['value'] + elif entry.get('summary'): + content = entry['summary'] + return content + + +async def get_article_details(entry, fetch=True): + article_link = entry.get('link') + article_title = html.unescape(entry.get('title', '')) + if fetch and conf.CRAWLER_RESOLV and article_link or not article_title: + try: + # resolves URL behind proxies (like feedproxy.google.com) + response = await jarr_get(article_link, timeout=5) + except MissingSchema: + split, failed = urlsplit(article_link), False + for scheme in 'https', 'http': + new_link = urlunsplit(SplitResult(scheme, *split[1:])) + try: + response = await jarr_get(new_link, timeout=5) + except Exception as error: + failed = True + continue + failed = False + article_link = new_link + break + if failed: + return article_link, article_title or 'No title' + except Exception as error: + logger.info("Unable to get the real URL of %s. Won't fix " + "link or title. Error: %s", article_link, error) + return article_link, article_title or 'No title' + article_link = response.url + if not article_title: + bs_parsed = BeautifulSoup(response.content, 'html.parser', + parse_only=SoupStrainer('head')) + try: + article_title = bs_parsed.find_all('title')[0].text + except IndexError: # no title + pass + return article_link, article_title or 'No title' + + +class FiltersAction(Enum): + READ = 'mark as read' + LIKED = 'mark as favorite' + SKIP = 'skipped' + + +class FiltersType(Enum): + REGEX = 'regex' + MATCH = 'simple match' + EXACT_MATCH = 'exact match' + TAG_MATCH = 'tag match' + TAG_CONTAINS = 'tag contains' + + +class FiltersTrigger(Enum): + MATCH = 'match' + NO_MATCH = 'no match' + + +def process_filters(filters, article, only_actions=None): + skipped, read, liked = False, None, False + filters = filters or [] + if only_actions is None: + only_actions = set(FiltersAction) + for filter_ in filters: + match = False + try: + pattern = filter_.get('pattern', '') + filter_type = FiltersType(filter_.get('type')) + filter_action = FiltersAction(filter_.get('action')) + filter_trigger = FiltersTrigger(filter_.get('action on')) + if filter_type is not FiltersType.REGEX: + pattern = pattern.lower() + except ValueError: + continue + if filter_action not in only_actions: + logger.debug('ignoring filter %r' % filter_) + continue + if filter_action in {FiltersType.REGEX, FiltersType.MATCH, + FiltersType.EXACT_MATCH} and 'title' not in article: + continue + if filter_action in {FiltersType.TAG_MATCH, FiltersType.TAG_CONTAINS} \ + and 'tags' not in article: + continue + title = article.get('title', '').lower() + tags = [tag.lower() for tag in article.get('tags', [])] + if filter_type is FiltersType.REGEX: + match = re.match(pattern, title) + elif filter_type is FiltersType.MATCH: + match = pattern in title + elif filter_type is FiltersType.EXACT_MATCH: + match = pattern == title + elif filter_type is FiltersType.TAG_MATCH: + match = pattern in tags + elif filter_type is FiltersType.TAG_CONTAINS: + match = any(pattern in tag for tag in tags) + take_action = match and filter_trigger is FiltersTrigger.MATCH \ + or not match and filter_trigger is FiltersTrigger.NO_MATCH + + if not take_action: + continue + + if filter_action is FiltersAction.READ: + read = True + elif filter_action is FiltersAction.LIKED: + liked = True + elif filter_action is FiltersAction.SKIP: + skipped = True + + if skipped or read or liked: + logger.info("%r applied on %r", filter_action.value, + article.get('link') or article.get('title')) + return skipped, read, liked + + +def get_skip_and_ids(entry, feed): + entry_ids = construct_article(entry, feed, + {'entry_id', 'feed_id', 'user_id'}, fetch=False) + skipped, _, _ = process_filters(feed.filters, + construct_article(entry, feed, {'title', 'tags'}, fetch=False), + {FiltersAction.SKIP}) + return skipped, entry_ids diff --git a/src/lib/data.py b/src/lib/data.py new file mode 100644 index 00000000..d887c003 --- /dev/null +++ b/src/lib/data.py @@ -0,0 +1,162 @@ +#! /usr/bin/env python +#-*- coding: utf-8 -*- + +# Newspipe - A Web based news aggregator. +# Copyright (C) 2010-2016 Cédric Bonhomme - https://www.cedricbonhomme.org +# +# For more information : https://github.com/newspipe/newspipe +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 0.1 $" +__date__ = "$Date: 2016/11/17 $" +__revision__ = "$Date: 2016/11/17 $" +__copyright__ = "Copyright (c) Cedric Bonhomme" +__license__ = "AGPLv3" + +# +# This file contains the import/export functions of Newspipe. +# + +import json +import opml +import datetime +from flask import jsonify + +from bootstrap import db +from web.models import User, Feed, Article + + +def import_opml(email, opml_content): + """ + Import new feeds from an OPML file. + """ + user = User.query.filter(User.email == email).first() + try: + subscriptions = opml.from_string(opml_content) + except: + logger.exception("Parsing OPML file failed:") + raise + + def read(subsubscription, nb=0): + """ + Parse recursively through the categories and sub-categories. + """ + for subscription in subsubscription: + if len(subscription) != 0: + nb = read(subscription, nb) + else: + try: + title = subscription.text + except: + title = "" + try: + description = subscription.description + except: + description = "" + try: + link = subscription.xmlUrl + except: + continue + if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == link).first(): + continue + try: + site_link = subscription.htmlUrl + except: + site_link = "" + new_feed = Feed(title=title, description=description, + link=link, site_link=site_link, + enabled=True) + user.feeds.append(new_feed) + nb += 1 + return nb + nb = read(subscriptions) + db.session.commit() + return nb + + +def import_json(email, json_content): + """ + Import an account from a JSON file. + """ + user = User.query.filter(User.email == email).first() + json_account = json.loads(json_content.decode("utf-8")) + nb_feeds, nb_articles = 0, 0 + # Create feeds: + for feed in json_account["result"]: + if None != Feed.query.filter(Feed.user_id == user.id, + Feed.link == feed["link"]).first(): + continue + new_feed = Feed(title=feed["title"], + description="", + link=feed["link"], + site_link=feed["site_link"], + created_date=datetime.datetime. + fromtimestamp(int(feed["created_date"])), + enabled=feed["enabled"]) + user.feeds.append(new_feed) + nb_feeds += 1 + db.session.commit() + # Create articles: + for feed in json_account["result"]: + user_feed = Feed.query.filter(Feed.user_id == user.id, + Feed.link == feed["link"]).first() + if None != user_feed: + for article in feed["articles"]: + if None == Article.query.filter(Article.user_id == user.id, + Article.feed_id == user_feed.id, + Article.link == article["link"]).first(): + new_article = Article(entry_id=article["link"], + link=article["link"], + title=article["title"], + content=article["content"], + readed=article["readed"], + like=article["like"], + retrieved_date=datetime.datetime. + fromtimestamp(int(article["retrieved_date"])), + date=datetime.datetime. + fromtimestamp(int(article["date"])), + user_id=user.id, + feed_id=user_feed.id) + user_feed.articles.append(new_article) + nb_articles += 1 + db.session.commit() + return nb_feeds, nb_articles + + +def export_json(user): + """ + Export all articles of user in JSON. + """ + result = [] + for feed in user.feeds: + result.append({ + "title": feed.title, + "description": feed.description, + "link": feed.link, + "site_link": feed.site_link, + "enabled": feed.enabled, + "created_date": feed.created_date.strftime('%s'), + "articles": [ { + "title": article.title, + "link": article.link, + "content": article.content, + "readed": article.readed, + "like": article.like, + "date": article.date.strftime('%s'), + "retrieved_date": article.retrieved_date.strftime('%s') + } for article in feed.articles ] + }) + return jsonify(result=result) diff --git a/src/lib/feed_utils.py b/src/lib/feed_utils.py new file mode 100644 index 00000000..492391aa --- /dev/null +++ b/src/lib/feed_utils.py @@ -0,0 +1,123 @@ +import html +import urllib +import logging +import requests +import feedparser +from conf import CRAWLER_USER_AGENT +from bs4 import BeautifulSoup, SoupStrainer + +from lib.utils import try_keys, try_get_icon_url, rebuild_url + +logger = logging.getLogger(__name__) +logging.captureWarnings(True) +ACCEPTED_MIMETYPES = ('application/rss+xml', 'application/rdf+xml', + 'application/atom+xml', 'application/xml', 'text/xml') + + +def is_parsing_ok(parsed_feed): + return parsed_feed['entries'] or not parsed_feed['bozo'] + + +def escape_keys(*keys): + def wrapper(func): + def metawrapper(*args, **kwargs): + result = func(*args, **kwargs) + for key in keys: + if key in result: + result[key] = html.unescape(result[key] or '') + return result + return metawrapper + return wrapper + + +@escape_keys('title', 'description') +def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): + requests_kwargs = {'headers': {'User-Agent': CRAWLER_USER_AGENT}, + 'verify': False} + if url is None and fp_parsed is not None: + url = fp_parsed.get('url') + if url is not None and fp_parsed is None: + try: + response = requests.get(url, **requests_kwargs) + fp_parsed = feedparser.parse(response.content, + request_headers=response.headers) + except Exception: + logger.exception('failed to retreive that url') + fp_parsed = {'bozo': True} + assert url is not None and fp_parsed is not None + feed = feed or {} + feed_split = urllib.parse.urlsplit(url) + site_split = None + if is_parsing_ok(fp_parsed): + feed['link'] = url + feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link') + feed['title'] = fp_parsed['feed'].get('title') + feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title') + feed['icon_url'] = try_keys(fp_parsed['feed'], 'icon') + else: + feed['site_link'] = url + + if feed.get('site_link'): + feed['site_link'] = rebuild_url(feed['site_link'], feed_split) + site_split = urllib.parse.urlsplit(feed['site_link']) + + if feed.get('icon_url'): + feed['icon_url'] = try_get_icon_url( + feed['icon_url'], site_split, feed_split) + if feed['icon_url'] is None: + del feed['icon_url'] + + if not feed.get('site_link') or not query_site \ + or all(bool(feed.get(k)) for k in ('link', 'title', 'icon_url')): + return feed + + try: + response = requests.get(feed['site_link'], **requests_kwargs) + except Exception: + logger.exception('failed to retreive %r', feed['site_link']) + return feed + bs_parsed = BeautifulSoup(response.content, 'html.parser', + parse_only=SoupStrainer('head')) + + if not feed.get('title'): + try: + feed['title'] = bs_parsed.find_all('title')[0].text + except Exception: + pass + + def check_keys(**kwargs): + def wrapper(elem): + for key, vals in kwargs.items(): + if not elem.has_attr(key): + return False + if not all(val in elem.attrs[key] for val in vals): + return False + return True + return wrapper + + if not feed.get('icon_url'): + icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut'])) + if not len(icons): + icons = bs_parsed.find_all(check_keys(rel=['icon'])) + if len(icons) >= 1: + for icon in icons: + feed['icon_url'] = try_get_icon_url(icon.attrs['href'], + site_split, feed_split) + if feed['icon_url'] is not None: + break + + if feed.get('icon_url') is None: + feed['icon_url'] = try_get_icon_url('/favicon.ico', + site_split, feed_split) + if 'icon_url' in feed and feed['icon_url'] is None: + del feed['icon_url'] + + if not feed.get('link'): + for type_ in ACCEPTED_MIMETYPES: + alternates = bs_parsed.find_all(check_keys( + rel=['alternate'], type=[type_])) + if len(alternates) >= 1: + feed['link'] = rebuild_url(alternates[0].attrs['href'], + feed_split) + break + return feed diff --git a/src/lib/misc_utils.py b/src/lib/misc_utils.py new file mode 100755 index 00000000..d594c01e --- /dev/null +++ b/src/lib/misc_utils.py @@ -0,0 +1,185 @@ +#! /usr/bin/env python +#-*- coding: utf-8 -*- + +# Newspipe - A Web based news aggregator. +# Copyright (C) 2010-2016 Cédric Bonhomme - https://www.cedricbonhomme.org +# +# For more information : https://github.com/newspipe/newspipe +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 1.9 $" +__date__ = "$Date: 2010/12/07 $" +__revision__ = "$Date: 2016/01/17 $" +__copyright__ = "Copyright (c) Cedric Bonhomme" +__license__ = "AGPLv3" + +import re +import os +import sys +import glob +import json +import logging +import operator +import urllib +import subprocess +import sqlalchemy +try: + from urlparse import urlparse, parse_qs, urlunparse +except: + from urllib.parse import urlparse, parse_qs, urlunparse, urljoin +from collections import Counter +from contextlib import contextmanager +from flask import request + +import conf +from web.controllers import ArticleController +from lib.utils import clear_string + +logger = logging.getLogger(__name__) + +ALLOWED_EXTENSIONS = set(['xml', 'opml', 'json']) + + +def is_safe_url(target): + """ + Ensures that a redirect target will lead to the same server. + """ + ref_url = urlparse(request.host_url) + test_url = urlparse(urljoin(request.host_url, target)) + return test_url.scheme in ('http', 'https') and \ + ref_url.netloc == test_url.netloc + + +def get_redirect_target(): + """ + Looks at various hints to find the redirect target. + """ + for target in request.args.get('next'), request.referrer: + if not target: + continue + if is_safe_url(target): + return target + + +def allowed_file(filename): + """ + Check if the uploaded file is allowed. + """ + return '.' in filename and \ + filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS + + +@contextmanager +def opened_w_error(filename, mode="r"): + try: + f = open(filename, mode) + except IOError as err: + yield None, err + else: + try: + yield f, None + finally: + f.close() + + +def fetch(id, feed_id=None): + """ + Fetch the feeds in a new processus. + The "asyncio" crawler is launched with the manager. + """ + cmd = [sys.executable, conf.BASE_DIR + '/manager.py', 'fetch_asyncio', + '--user_id='+str(id)] + if feed_id: + cmd.append('--feed_id='+str(feed_id)) + return subprocess.Popen(cmd, stdout=subprocess.PIPE) + + +def history(user_id, year=None, month=None): + """ + Sort articles by year and month. + """ + articles_counter = Counter() + articles = ArticleController(user_id).read() + if None != year: + articles = articles.filter(sqlalchemy.extract('year', 'Article.date') == year) + if None != month: + articles = articles.filter(sqlalchemy.extract('month', 'Article.date') == month) + for article in articles.all(): + if None != year: + articles_counter[article.date.month] += 1 + else: + articles_counter[article.date.year] += 1 + return articles_counter, articles + + +def clean_url(url): + """ + Remove utm_* parameters + """ + parsed_url = urlparse(url) + qd = parse_qs(parsed_url.query, keep_blank_values=True) + filtered = dict((k, v) for k, v in qd.items() + if not k.startswith('utm_')) + return urlunparse([ + parsed_url.scheme, + parsed_url.netloc, + urllib.parse.quote(urllib.parse.unquote(parsed_url.path)), + parsed_url.params, + urllib.parse.urlencode(filtered, doseq=True), + parsed_url.fragment + ]).rstrip('=') + + +def load_stop_words(): + """ + Load the stop words and return them in a list. + """ + stop_words_lists = glob.glob(os.path.join(conf.BASE_DIR, + 'web/var/stop_words/*.txt')) + stop_words = [] + + for stop_wods_list in stop_words_lists: + with opened_w_error(stop_wods_list, "r") as (stop_wods_file, err): + if err: + stop_words = [] + else: + stop_words += stop_wods_file.read().split(";") + return stop_words + + +def top_words(articles, n=10, size=5): + """ + Return the n most frequent words in a list. + """ + stop_words = load_stop_words() + words = Counter() + wordre = re.compile(r'\b\w{%s,}\b' % size, re.I) + for article in articles: + for word in [elem.lower() for elem in + wordre.findall(clear_string(article.content)) \ + if elem.lower() not in stop_words]: + words[word] += 1 + return words.most_common(n) + + +def tag_cloud(tags): + """ + Generates a tags cloud. + """ + tags.sort(key=operator.itemgetter(0)) + return '\n'.join([('<font size=%d>%s</font>' % \ + (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word)) \ + for (word, count) in tags]) diff --git a/src/lib/utils.py b/src/lib/utils.py new file mode 100644 index 00000000..d206b769 --- /dev/null +++ b/src/lib/utils.py @@ -0,0 +1,89 @@ +import re +import types +import urllib +import logging +import requests +from hashlib import md5 +from flask import request, url_for + +import conf + +logger = logging.getLogger(__name__) + + +def default_handler(obj, role='admin'): + """JSON handler for default query formatting""" + if hasattr(obj, 'isoformat'): + return obj.isoformat() + if hasattr(obj, 'dump'): + return obj.dump(role=role) + if isinstance(obj, (set, frozenset, types.GeneratorType)): + return list(obj) + if isinstance(obj, BaseException): + return str(obj) + raise TypeError("Object of type %s with value of %r " + "is not JSON serializable" % (type(obj), obj)) + + +def try_keys(dico, *keys): + for key in keys: + if key in dico: + return dico[key] + return + + +def rebuild_url(url, base_split): + split = urllib.parse.urlsplit(url) + if split.scheme and split.netloc: + return url # url is fine + new_split = urllib.parse.SplitResult( + scheme=split.scheme or base_split.scheme, + netloc=split.netloc or base_split.netloc, + path=split.path, query='', fragment='') + return urllib.parse.urlunsplit(new_split) + + +def try_get_icon_url(url, *splits): + for split in splits: + if split is None: + continue + rb_url = rebuild_url(url, split) + response = None + # if html in content-type, we assume it's a fancy 404 page + try: + response = jarr_get(rb_url) + content_type = response.headers.get('content-type', '') + except Exception: + pass + else: + if response is not None and response.ok \ + and 'html' not in content_type and response.content: + return response.url + return None + + +def to_hash(text): + return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\ + .hexdigest() + + +def clear_string(data): + """ + Clear a string by removing HTML tags, HTML special caracters + and consecutive white spaces (more that one). + """ + p = re.compile('<[^>]+>') # HTML tags + q = re.compile('\s') # consecutive white spaces + return p.sub('', q.sub(' ', data)) + + +def redirect_url(default='home'): + return request.args.get('next') or request.referrer or url_for(default) + + +async def jarr_get(url, **kwargs): + request_kwargs = {'verify': False, 'allow_redirects': True, + 'timeout': conf.CRAWLER_TIMEOUT, + 'headers': {'User-Agent': conf.CRAWLER_USER_AGENT}} + request_kwargs.update(kwargs) + return requests.get(url, **request_kwargs) |