Diffstat (limited to 'src/web/lib')
-rw-r--r-- | src/web/lib/article_utils.py | 186 |
-rw-r--r-- | src/web/lib/feed_utils.py    | 123 |
-rwxr-xr-x | src/web/lib/misc_utils.py    | 286 |
-rw-r--r-- | src/web/lib/utils.py         |  89 |
-rw-r--r-- | src/web/lib/view_utils.py    |   2 |
5 files changed, 1 insertion, 685 deletions
diff --git a/src/web/lib/article_utils.py b/src/web/lib/article_utils.py
deleted file mode 100644
index 2c5ea8c3..00000000
--- a/src/web/lib/article_utils.py
+++ /dev/null
@@ -1,186 +0,0 @@
-import html
-import logging
-import re
-from datetime import datetime, timezone
-from enum import Enum
-from urllib.parse import SplitResult, urlsplit, urlunsplit
-
-import dateutil.parser
-from bs4 import BeautifulSoup, SoupStrainer
-from requests.exceptions import MissingSchema
-
-import conf
-from web.lib.utils import jarr_get
-
-logger = logging.getLogger(__name__)
-PROCESSED_DATE_KEYS = {'published', 'created', 'updated'}
-
-
-def extract_id(entry):
-    """ extract a value from an entry that will identify it among the other of
-    that feed"""
-    return entry.get('entry_id') or entry.get('id') or entry['link']
-
-
-async def construct_article(entry, feed, fields=None, fetch=True):
-    "Safe method to transorm a feedparser entry into an article"
-    now = datetime.utcnow()
-    article = {}
-    def push_in_article(key, value):
-        if not fields or key in fields:
-            article[key] = value
-    push_in_article('feed_id', feed.id)
-    push_in_article('user_id', feed.user_id)
-    push_in_article('entry_id', extract_id(entry))
-    push_in_article('retrieved_date', now)
-    if not fields or 'date' in fields:
-        for date_key in PROCESSED_DATE_KEYS:
-            if entry.get(date_key):
-                try:
-                    article['date'] = dateutil.parser.parse(entry[date_key])\
-                            .astimezone(timezone.utc)
-                except Exception as e:
-                    logger.exception(e)
-                else:
-                    break
-    push_in_article('content', get_article_content(entry))
-    if fields is None or {'link', 'title'}.intersection(fields):
-        link, title = await get_article_details(entry, fetch)
-        push_in_article('link', link)
-        push_in_article('title', title)
-        if 'content' in article:
-            #push_in_article('content', clean_urls(article['content'], link))
-            push_in_article('content', article['content'])
-    push_in_article('tags', {tag.get('term').strip()
-                             for tag in entry.get('tags', []) \
-                             if tag and tag.get('term', False)})
-    return article
-
-
-def get_article_content(entry):
-    content = ''
-    if entry.get('content'):
-        content = entry['content'][0]['value']
-    elif entry.get('summary'):
-        content = entry['summary']
-    return content
-
-
-async def get_article_details(entry, fetch=True):
-    article_link = entry.get('link')
-    article_title = html.unescape(entry.get('title', ''))
-    if fetch and conf.CRAWLER_RESOLV and article_link or not article_title:
-        try:
-            # resolves URL behind proxies (like feedproxy.google.com)
-            response = await jarr_get(article_link, timeout=5)
-        except MissingSchema:
-            split, failed = urlsplit(article_link), False
-            for scheme in 'https', 'http':
-                new_link = urlunsplit(SplitResult(scheme, *split[1:]))
-                try:
-                    response = await jarr_get(new_link, timeout=5)
-                except Exception as error:
-                    failed = True
-                    continue
-                failed = False
-                article_link = new_link
-                break
-            if failed:
-                return article_link, article_title or 'No title'
-        except Exception as error:
-            logger.info("Unable to get the real URL of %s. Won't fix "
-                        "link or title. Error: %s", article_link, error)
-            return article_link, article_title or 'No title'
-        article_link = response.url
-        if not article_title:
-            bs_parsed = BeautifulSoup(response.content, 'html.parser',
-                                      parse_only=SoupStrainer('head'))
-            try:
-                article_title = bs_parsed.find_all('title')[0].text
-            except IndexError:  # no title
-                pass
-    return article_link, article_title or 'No title'
-
-
-class FiltersAction(Enum):
-    READ = 'mark as read'
-    LIKED = 'mark as favorite'
-    SKIP = 'skipped'
-
-
-class FiltersType(Enum):
-    REGEX = 'regex'
-    MATCH = 'simple match'
-    EXACT_MATCH = 'exact match'
-    TAG_MATCH = 'tag match'
-    TAG_CONTAINS = 'tag contains'
-
-
-class FiltersTrigger(Enum):
-    MATCH = 'match'
-    NO_MATCH = 'no match'
-
-
-def process_filters(filters, article, only_actions=None):
-    skipped, read, liked = False, None, False
-    filters = filters or []
-    if only_actions is None:
-        only_actions = set(FiltersAction)
-    for filter_ in filters:
-        match = False
-        try:
-            pattern = filter_.get('pattern', '')
-            filter_type = FiltersType(filter_.get('type'))
-            filter_action = FiltersAction(filter_.get('action'))
-            filter_trigger = FiltersTrigger(filter_.get('action on'))
-            if filter_type is not FiltersType.REGEX:
-                pattern = pattern.lower()
-        except ValueError:
-            continue
-        if filter_action not in only_actions:
-            logger.debug('ignoring filter %r' % filter_)
-            continue
-        if filter_action in {FiltersType.REGEX, FiltersType.MATCH,
-                FiltersType.EXACT_MATCH} and 'title' not in article:
-            continue
-        if filter_action in {FiltersType.TAG_MATCH, FiltersType.TAG_CONTAINS} \
-                and 'tags' not in article:
-            continue
-        title = article.get('title', '').lower()
-        tags = [tag.lower() for tag in article.get('tags', [])]
-        if filter_type is FiltersType.REGEX:
-            match = re.match(pattern, title)
-        elif filter_type is FiltersType.MATCH:
-            match = pattern in title
-        elif filter_type is FiltersType.EXACT_MATCH:
-            match = pattern == title
-        elif filter_type is FiltersType.TAG_MATCH:
-            match = pattern in tags
-        elif filter_type is FiltersType.TAG_CONTAINS:
-            match = any(pattern in tag for tag in tags)
-        take_action = match and filter_trigger is FiltersTrigger.MATCH \
-                or not match and filter_trigger is FiltersTrigger.NO_MATCH
-
-        if not take_action:
-            continue
-
-        if filter_action is FiltersAction.READ:
-            read = True
-        elif filter_action is FiltersAction.LIKED:
-            liked = True
-        elif filter_action is FiltersAction.SKIP:
-            skipped = True
-
-    if skipped or read or liked:
-        logger.info("%r applied on %r", filter_action.value,
-                    article.get('link') or article.get('title'))
-    return skipped, read, liked
-
-
-def get_skip_and_ids(entry, feed):
-    entry_ids = construct_article(entry, feed,
-            {'entry_id', 'feed_id', 'user_id'}, fetch=False)
-    skipped, _, _ = process_filters(feed.filters,
-            construct_article(entry, feed, {'title', 'tags'}, fetch=False),
-            {FiltersAction.SKIP})
-    return skipped, entry_ids
diff --git a/src/web/lib/feed_utils.py b/src/web/lib/feed_utils.py
deleted file mode 100644
index ef5d4f08..00000000
--- a/src/web/lib/feed_utils.py
+++ /dev/null
@@ -1,123 +0,0 @@
-import html
-import urllib
-import logging
-import requests
-import feedparser
-from conf import CRAWLER_USER_AGENT
-from bs4 import BeautifulSoup, SoupStrainer
-
-from web.lib.utils import try_keys, try_get_icon_url, rebuild_url
-
-logger = logging.getLogger(__name__)
-logging.captureWarnings(True)
-ACCEPTED_MIMETYPES = ('application/rss+xml', 'application/rdf+xml',
-                      'application/atom+xml', 'application/xml', 'text/xml')
-
-
-def is_parsing_ok(parsed_feed):
-    return parsed_feed['entries'] or not parsed_feed['bozo']
-
-
-def escape_keys(*keys):
-    def wrapper(func):
-        def metawrapper(*args, **kwargs):
-            result = func(*args, **kwargs)
-            for key in keys:
-                if key in result:
-                    result[key] = html.unescape(result[key] or '')
-            return result
-        return metawrapper
-    return wrapper
-
-
-@escape_keys('title', 'description')
-def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
-    requests_kwargs = {'headers': {'User-Agent': CRAWLER_USER_AGENT},
-                       'verify': False}
-    if url is None and fp_parsed is not None:
-        url = fp_parsed.get('url')
-    if url is not None and fp_parsed is None:
-        try:
-            response = requests.get(url, **requests_kwargs)
-            fp_parsed = feedparser.parse(response.content,
-                                         request_headers=response.headers)
-        except Exception:
-            logger.exception('failed to retreive that url')
-            fp_parsed = {'bozo': True}
-    assert url is not None and fp_parsed is not None
-    feed = feed or {}
-    feed_split = urllib.parse.urlsplit(url)
-    site_split = None
-    if is_parsing_ok(fp_parsed):
-        feed['link'] = url
-        feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
-        feed['title'] = fp_parsed['feed'].get('title')
-        feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
-        feed['icon_url'] = try_keys(fp_parsed['feed'], 'icon')
-    else:
-        feed['site_link'] = url
-
-    if feed.get('site_link'):
-        feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
-        site_split = urllib.parse.urlsplit(feed['site_link'])
-
-    if feed.get('icon_url'):
-        feed['icon_url'] = try_get_icon_url(
-                feed['icon_url'], site_split, feed_split)
-        if feed['icon_url'] is None:
-            del feed['icon_url']
-
-    if not feed.get('site_link') or not query_site \
-            or all(bool(feed.get(k)) for k in ('link', 'title', 'icon_url')):
-        return feed
-
-    try:
-        response = requests.get(feed['site_link'], **requests_kwargs)
-    except Exception:
-        logger.exception('failed to retreive %r', feed['site_link'])
-        return feed
-    bs_parsed = BeautifulSoup(response.content, 'html.parser',
-                              parse_only=SoupStrainer('head'))
-
-    if not feed.get('title'):
-        try:
-            feed['title'] = bs_parsed.find_all('title')[0].text
-        except Exception:
-            pass
-
-    def check_keys(**kwargs):
-        def wrapper(elem):
-            for key, vals in kwargs.items():
-                if not elem.has_attr(key):
-                    return False
-                if not all(val in elem.attrs[key] for val in vals):
-                    return False
-            return True
-        return wrapper
-
-    if not feed.get('icon_url'):
-        icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
-        if not len(icons):
-            icons = bs_parsed.find_all(check_keys(rel=['icon']))
-        if len(icons) >= 1:
-            for icon in icons:
-                feed['icon_url'] = try_get_icon_url(icon.attrs['href'],
-                                                    site_split, feed_split)
-                if feed['icon_url'] is not None:
-                    break
-
-        if feed.get('icon_url') is None:
-            feed['icon_url'] = try_get_icon_url('/favicon.ico',
-                                                site_split, feed_split)
-        if 'icon_url' in feed and feed['icon_url'] is None:
-            del feed['icon_url']
-
-    if not feed.get('link'):
-        for type_ in ACCEPTED_MIMETYPES:
-            alternates = bs_parsed.find_all(check_keys(
-                    rel=['alternate'], type=[type_]))
-            if len(alternates) >= 1:
-                feed['link'] = rebuild_url(alternates[0].attrs['href'],
-                                           feed_split)
-                break
-    return feed
diff --git a/src/web/lib/misc_utils.py b/src/web/lib/misc_utils.py
deleted file mode 100755
index 6a0e00ec..00000000
--- a/src/web/lib/misc_utils.py
+++ /dev/null
@@ -1,286 +0,0 @@
-#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-# Newspipe - A Web based news aggregator.
-# Copyright (C) 2010-2016 Cédric Bonhomme - https://www.cedricbonhomme.org
-#
-# For more information : https://github.com/Newspipe/Newspipe
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>.
-
-__author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 1.8 $"
-__date__ = "$Date: 2010/12/07 $"
-__revision__ = "$Date: 2016/04/10 $"
-__copyright__ = "Copyright (c) Cedric Bonhomme"
-__license__ = "AGPLv3"
-
-#
-# This file provides functions used for:
-# - import from a JSON file;
-# - generation of tags cloud;
-# - HTML processing.
-#
-
-import re
-import os
-import sys
-import glob
-import opml
-import json
-import logging
-import datetime
-import operator
-import urllib
-import subprocess
-import sqlalchemy
-try:
-    from urlparse import urlparse, parse_qs, urlunparse
-except:
-    from urllib.parse import urlparse, parse_qs, urlunparse, urljoin
-from bs4 import BeautifulSoup
-from collections import Counter
-from contextlib import contextmanager
-from flask import request
-
-import conf
-from bootstrap import db
-from web import controllers
-from web.models import User, Feed, Article
-from web.lib.utils import clear_string
-
-logger = logging.getLogger(__name__)
-
-ALLOWED_EXTENSIONS = set(['xml', 'opml', 'json'])
-
-def is_safe_url(target):
-    """
-    Ensures that a redirect target will lead to the same server.
-    """
-    ref_url = urlparse(request.host_url)
-    test_url = urlparse(urljoin(request.host_url, target))
-    return test_url.scheme in ('http', 'https') and \
-           ref_url.netloc == test_url.netloc
-
-def get_redirect_target():
-    """
-    Looks at various hints to find the redirect target.
-    """
-    for target in request.args.get('next'), request.referrer:
-        if not target:
-            continue
-        if is_safe_url(target):
-            return target
-
-def allowed_file(filename):
-    """
-    Check if the uploaded file is allowed.
-    """
-    return '.' in filename and \
-            filename.rsplit('.', 1)[1] in ALLOWED_EXTENSIONS
-
-@contextmanager
-def opened_w_error(filename, mode="r"):
-    try:
-        f = open(filename, mode)
-    except IOError as err:
-        yield None, err
-    else:
-        try:
-            yield f, None
-        finally:
-            f.close()
-
-def fetch(id, feed_id=None):
-    """
-    Fetch the feeds in a new processus.
-    The "asyncio" crawler is launched with the manager.
-    """
-    cmd = [sys.executable, conf.BASE_DIR + '/manager.py', 'fetch_asyncio',
-           '--user_id='+str(id)]
-    if feed_id:
-        cmd.append('--feed_id='+str(feed_id))
-    return subprocess.Popen(cmd, stdout=subprocess.PIPE)
-
-def history(user_id, year=None, month=None):
-    """
-    Sort articles by year and month.
-    """
-    articles_counter = Counter()
-    articles = controllers.ArticleController(user_id).read()
-    if None != year:
-        articles = articles.filter(sqlalchemy.extract('year', Article.date) == year)
-        if None != month:
-            articles = articles.filter(sqlalchemy.extract('month', Article.date) == month)
-    for article in articles.all():
-        if None != year:
-            articles_counter[article.date.month] += 1
-        else:
-            articles_counter[article.date.year] += 1
-    return articles_counter, articles
-
-def import_opml(email, opml_content):
-    """
-    Import new feeds from an OPML file.
-    """
-    user = User.query.filter(User.email == email).first()
-    try:
-        subscriptions = opml.from_string(opml_content)
-    except:
-        logger.exception("Parsing OPML file failed:")
-        raise
-
-    def read(subsubscription, nb=0):
-        """
-        Parse recursively through the categories and sub-categories.
-        """
-        for subscription in subsubscription:
-            if len(subscription) != 0:
-                nb = read(subscription, nb)
-            else:
-                try:
-                    title = subscription.text
-                except:
-                    title = ""
-                try:
-                    description = subscription.description
-                except:
-                    description = ""
-                try:
-                    link = subscription.xmlUrl
-                except:
-                    continue
-                if None != Feed.query.filter(Feed.user_id == user.id, Feed.link == link).first():
-                    continue
-                try:
-                    site_link = subscription.htmlUrl
-                except:
-                    site_link = ""
-                new_feed = Feed(title=title, description=description,
-                                link=link, site_link=site_link,
-                                enabled=True)
-                user.feeds.append(new_feed)
-                nb += 1
-        return nb
-    nb = read(subscriptions)
-    db.session.commit()
-    return nb
-
-def import_json(email, json_content):
-    """
-    Import an account from a JSON file.
-    """
-    user = User.query.filter(User.email == email).first()
-    json_account = json.loads(json_content.decode("utf-8"))
-    nb_feeds, nb_articles = 0, 0
-    # Create feeds:
-    for feed in json_account["result"]:
-        if None != Feed.query.filter(Feed.user_id == user.id,
-                                     Feed.link == feed["link"]).first():
-            continue
-        new_feed = Feed(title=feed["title"],
-                        description="",
-                        link=feed["link"],
-                        site_link=feed["site_link"],
-                        created_date=datetime.datetime.
-                            fromtimestamp(int(feed["created_date"])),
-                        enabled=feed["enabled"])
-        user.feeds.append(new_feed)
-        nb_feeds += 1
-    db.session.commit()
-    # Create articles:
-    for feed in json_account["result"]:
-        user_feed = Feed.query.filter(Feed.user_id == user.id,
-                                      Feed.link == feed["link"]).first()
-        if None != user_feed:
-            for article in feed["articles"]:
-                if None == Article.query.filter(Article.user_id == user.id,
-                                        Article.feed_id == user_feed.id,
-                                        Article.link == article["link"]).first():
-                    new_article = Article(entry_id=article["link"],
-                                          link=article["link"],
-                                          title=article["title"],
-                                          content=article["content"],
-                                          readed=article["readed"],
-                                          like=article["like"],
-                                          retrieved_date=datetime.datetime.
-                                              fromtimestamp(int(article["retrieved_date"])),
-                                          date=datetime.datetime.
-                                              fromtimestamp(int(article["date"])),
-                                          user_id=user.id,
-                                          feed_id=user_feed.id)
-                    user_feed.articles.append(new_article)
-                    nb_articles += 1
-    db.session.commit()
-    return nb_feeds, nb_articles
-
-def clean_url(url):
-    """
-    Remove utm_* parameters
-    """
-    parsed_url = urlparse(url)
-    qd = parse_qs(parsed_url.query, keep_blank_values=True)
-    filtered = dict((k, v) for k, v in qd.items()
-                    if not k.startswith('utm_'))
-    return urlunparse([
-        parsed_url.scheme,
-        parsed_url.netloc,
-        urllib.parse.quote(urllib.parse.unquote(parsed_url.path)),
-        parsed_url.params,
-        urllib.parse.urlencode(filtered, doseq=True),
-        parsed_url.fragment
-    ]).rstrip('=')
-
-def load_stop_words():
-    """
-    Load the stop words and return them in a list.
-    """
-    stop_words_lists = glob.glob(os.path.join(conf.BASE_DIR,
-                                              'web/var/stop_words/*.txt'))
-    stop_words = []
-
-    for stop_wods_list in stop_words_lists:
-        with opened_w_error(stop_wods_list, "r") as (stop_wods_file, err):
-            if err:
-                stop_words = []
-            else:
-                stop_words += stop_wods_file.read().split(";")
-    return stop_words
-
-def top_words(articles, n=10, size=5):
-    """
-    Return the n most frequent words in a list.
-    """
-    stop_words = load_stop_words()
-    words = Counter()
-    wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
-    for article in articles:
-        for word in [elem.lower() for elem in
-                wordre.findall(clear_string(article.content)) \
-                if elem.lower() not in stop_words]:
-            words[word] += 1
-    return words.most_common(n)
-
-def tag_cloud(tags):
-    """
-    Generates a tags cloud.
-    """
-    tags.sort(key=operator.itemgetter(0))
-    return '\n'.join([('<font size=%d>%s</font>' % \
-                    (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word)) \
-                    for (word, count) in tags])
-
-if __name__ == "__main__":
-    import_opml("root@newspipe.localhost", "./var/feeds_test.opml")
-    #import_opml("root@newspipe.localhost", "./var/Newspipe.opml")
diff --git a/src/web/lib/utils.py b/src/web/lib/utils.py
deleted file mode 100644
index d206b769..00000000
--- a/src/web/lib/utils.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import re
-import types
-import urllib
-import logging
-import requests
-from hashlib import md5
-from flask import request, url_for
-
-import conf
-
-logger = logging.getLogger(__name__)
-
-
-def default_handler(obj, role='admin'):
-    """JSON handler for default query formatting"""
-    if hasattr(obj, 'isoformat'):
-        return obj.isoformat()
-    if hasattr(obj, 'dump'):
-        return obj.dump(role=role)
-    if isinstance(obj, (set, frozenset, types.GeneratorType)):
-        return list(obj)
-    if isinstance(obj, BaseException):
-        return str(obj)
-    raise TypeError("Object of type %s with value of %r "
-                    "is not JSON serializable" % (type(obj), obj))
-
-
-def try_keys(dico, *keys):
-    for key in keys:
-        if key in dico:
-            return dico[key]
-    return
-
-
-def rebuild_url(url, base_split):
-    split = urllib.parse.urlsplit(url)
-    if split.scheme and split.netloc:
-        return url  # url is fine
-    new_split = urllib.parse.SplitResult(
-            scheme=split.scheme or base_split.scheme,
-            netloc=split.netloc or base_split.netloc,
-            path=split.path, query='', fragment='')
-    return urllib.parse.urlunsplit(new_split)
-
-
-def try_get_icon_url(url, *splits):
-    for split in splits:
-        if split is None:
-            continue
-        rb_url = rebuild_url(url, split)
-        response = None
-        # if html in content-type, we assume it's a fancy 404 page
-        try:
-            response = jarr_get(rb_url)
-            content_type = response.headers.get('content-type', '')
-        except Exception:
-            pass
-        else:
-            if response is not None and response.ok \
-                    and 'html' not in content_type and response.content:
-                return response.url
-    return None
-
-
-def to_hash(text):
-    return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\
-            .hexdigest()
-
-
-def clear_string(data):
-    """
-    Clear a string by removing HTML tags, HTML special caracters
-    and consecutive white spaces (more that one).
-    """
-    p = re.compile('<[^>]+>')  # HTML tags
-    q = re.compile('\s')  # consecutive white spaces
-    return p.sub('', q.sub(' ', data))
-
-
-def redirect_url(default='home'):
-    return request.args.get('next') or request.referrer or url_for(default)
-
-
-async def jarr_get(url, **kwargs):
-    request_kwargs = {'verify': False, 'allow_redirects': True,
-                      'timeout': conf.CRAWLER_TIMEOUT,
-                      'headers': {'User-Agent': conf.CRAWLER_USER_AGENT}}
-    request_kwargs.update(kwargs)
-    return requests.get(url, **request_kwargs)
diff --git a/src/web/lib/view_utils.py b/src/web/lib/view_utils.py
index d4c119da..1d8c6aed 100644
--- a/src/web/lib/view_utils.py
+++ b/src/web/lib/view_utils.py
@@ -1,6 +1,6 @@
 from functools import wraps
 from flask import request, Response, make_response
-from web.lib.utils import to_hash
+from lib.utils import to_hash
 
 
 def etag_match(func):
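
Note: the only surviving change in this diff is the repointed import in src/web/lib/view_utils.py, from web.lib.utils to lib.utils, which suggests the deleted helpers were relocated under src/lib/; that new module lies outside the 'src/web/lib' path filter and so is not shown here. For reference, a minimal self-contained sketch of the to_hash helper that etag_match keeps using, assuming it retained the md5-hexdigest behaviour of the deleted src/web/lib/utils.py (an assumption, not something this diff confirms):

# Sketch only: assumes the relocated lib/utils.py kept the behaviour of the
# deleted web/lib/utils.py shown above; the new module is not part of this diff.
from hashlib import md5

def to_hash(text):
    # Accept str or bytes and return the md5 hex digest used for ETag comparison.
    return md5(text.encode('utf8') if hasattr(text, 'encode') else text).hexdigest()

# Usage: compare a rendered page's digest against the request's If-None-Match header.
print(to_hash('<html><body>article content</body></html>'))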