Diffstat (limited to 'src/web/lib')
-rw-r--r--   src/web/lib/crawler.py    |  7
-rw-r--r--   src/web/lib/feed_utils.py | 13
-rw-r--r--   src/web/lib/utils.py      | 16
3 files changed, 28 insertions(+), 8 deletions(-)
diff --git a/src/web/lib/crawler.py b/src/web/lib/crawler.py
index 7343ea4d..f480fe96 100644
--- a/src/web/lib/crawler.py
+++ b/src/web/lib/crawler.py
@@ -18,7 +18,6 @@ import json
 import logging
 import feedparser
 from datetime import datetime, timedelta
-from functools import wraps
 from time import strftime, gmtime
 from concurrent.futures import ThreadPoolExecutor
 from requests_futures.sessions import FuturesSession
@@ -132,7 +131,7 @@ class PyAggUpdater(AbstractCrawler):
                 {key: "%s -> %s" % (up_feed[key], self.feed.get(key))
                  for key in up_feed if up_feed[key] != self.feed.get(key)})
 
-        future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed)
+        self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed)
 
 
 class FeedCrawler(AbstractCrawler):
@@ -144,8 +143,8 @@ class FeedCrawler(AbstractCrawler):
     def clean_feed(self):
         """Will reset the errors counters on a feed that have known errors"""
         if self.feed.get('error_count') or self.feed.get('last_error'):
-            future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
-                                      {'error_count': 0, 'last_error': ''})
+            self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+                             {'error_count': 0, 'last_error': ''})
 
     def callback(self, response):
         """will fetch the feed and interprete results (304, etag) or will
diff --git a/src/web/lib/feed_utils.py b/src/web/lib/feed_utils.py
index 14e6b82b..80800bec 100644
--- a/src/web/lib/feed_utils.py
+++ b/src/web/lib/feed_utils.py
@@ -9,6 +9,8 @@ from web.lib.utils import try_keys, try_get_icon_url, rebuild_url
 
 logger = logging.getLogger(__name__)
 logging.captureWarnings(True)
+ACCEPTED_MIMETYPES = ('application/rss+xml', 'application/rdf+xml',
+                      'application/atom+xml', 'application/xml', 'text/xml')
 
 
 def is_parsing_ok(parsed_feed):
@@ -96,8 +98,11 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
             del feed['icon_url']
 
     if not feed.get('link'):
-        alternates = bs_parsed.find_all(check_keys(rel=['alternate'],
-                                                   type=['application/rss+xml']))
-        if len(alternates) >= 1:
-            feed['link'] = rebuild_url(alternates[0].attrs['href'], feed_split)
+        for type_ in ACCEPTED_MIMETYPES:
+            alternates = bs_parsed.find_all(check_keys(
+                    rel=['alternate'], type=[type_]))
+            if len(alternates) >= 1:
+                feed['link'] = rebuild_url(alternates[0].attrs['href'],
+                                           feed_split)
+                break
     return feed
diff --git a/src/web/lib/utils.py b/src/web/lib/utils.py
index aa552a12..88d24ba5 100644
--- a/src/web/lib/utils.py
+++ b/src/web/lib/utils.py
@@ -1,8 +1,10 @@
+import re
 import types
 import urllib
 import logging
 import requests
 from hashlib import md5
+from flask import request, url_for
 
 logger = logging.getLogger(__name__)
 
@@ -55,3 +57,17 @@ def try_get_icon_url(url, *splits):
 def to_hash(text):
     return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\
         .hexdigest()
+
+
+def clear_string(data):
+    """
+    Clear a string by removing HTML tags, HTML special caracters
+    and consecutive white spaces (more that one).
+    """
+    p = re.compile('<[^>]+>')  # HTML tags
+    q = re.compile('\s')  # consecutive white spaces
+    return p.sub('', q.sub(' ', data))
+
+
+def redirect_url(default='home'):
+    return request.args.get('next') or request.referrer or url_for(default)
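
The feed_utils.py change generalises feed discovery: instead of only looking for a <link rel="alternate" type="application/rss+xml"> element, construct_feed_from() now tries every MIME type in ACCEPTED_MIMETYPES and stops at the first match. Below is a minimal, self-contained sketch of that fallback; the sample HTML is hypothetical and BeautifulSoup is called directly, whereas the real code goes through the project's check_keys() and rebuild_url() helpers.

from bs4 import BeautifulSoup

ACCEPTED_MIMETYPES = ('application/rss+xml', 'application/rdf+xml',
                      'application/atom+xml', 'application/xml', 'text/xml')

# Hypothetical page that only advertises an Atom feed, so the old
# rss+xml-only lookup would have found nothing.
html = '''<html><head>
<link rel="alternate" type="application/atom+xml" href="/feeds/all.atom.xml">
</head><body></body></html>'''

soup = BeautifulSoup(html, 'html.parser')
feed_link = None
for type_ in ACCEPTED_MIMETYPES:
    # The first MIME type that yields a match wins, mirroring the
    # break statement added in the diff.
    alternates = soup.find_all('link', rel='alternate', type=type_)
    if alternates:
        feed_link = alternates[0].attrs['href']
        break

print(feed_link)  # /feeds/all.atom.xml

Because the tuple is ordered from most to least specific, a page that advertises both an RSS and a generic XML alternate resolves to the RSS one.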
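
The new clear_string() helper in utils.py strips markup in two regex passes: q replaces each whitespace character with a plain space, then p removes anything shaped like an HTML tag. A small usage sketch (the sample input is an assumption; r'\s' is used here to avoid the invalid-escape warning the plain '\s' literal triggers on newer Pythons):

import re

def clear_string(data):
    p = re.compile('<[^>]+>')  # HTML tags
    q = re.compile(r'\s')      # any single whitespace character
    return p.sub('', q.sub(' ', data))

print(clear_string('<p>Hello,\n\tworld!</p>'))
# -> 'Hello,  world!'  (the \n and \t each became one space)

Note that because the pattern is \s rather than \s+, each whitespace character maps to exactly one space, so runs of whitespace are normalised but not collapsed to a single space as the docstring suggests.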
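
redirect_url() gives views a sensible return target: the explicit ?next= query parameter if present, otherwise the Referer header, otherwise a named default endpoint. A minimal sketch of how a view might use it, where the app and the 'home' and 'login' endpoints are assumptions for illustration:

from flask import Flask, redirect, request, url_for

app = Flask(__name__)

def redirect_url(default='home'):
    # An explicit ?next= beats the Referer header, which beats the default.
    return request.args.get('next') or request.referrer or url_for(default)

@app.route('/')
def home():
    return 'home'

@app.route('/login', methods=['GET', 'POST'])
def login():
    # ... authenticate here, then send the user back where they came from.
    return redirect(redirect_url())

Trusting next and the Referer unconditionally allows open redirects, so callers that care about that usually validate the target host before redirecting.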