Diffstat (limited to 'src/web/lib')
-rw-r--r--  src/web/lib/crawler.py      7
-rw-r--r--  src/web/lib/feed_utils.py  13
-rw-r--r--  src/web/lib/utils.py       16
3 files changed, 28 insertions, 8 deletions
diff --git a/src/web/lib/crawler.py b/src/web/lib/crawler.py
index 7343ea4d..f480fe96 100644
--- a/src/web/lib/crawler.py
+++ b/src/web/lib/crawler.py
@@ -18,7 +18,6 @@ import json
import logging
import feedparser
from datetime import datetime, timedelta
-from functools import wraps
from time import strftime, gmtime
from concurrent.futures import ThreadPoolExecutor
from requests_futures.sessions import FuturesSession
@@ -132,7 +131,7 @@ class PyAggUpdater(AbstractCrawler):
                    {key: "%s -> %s" % (up_feed[key], self.feed.get(key))
                     for key in up_feed if up_feed[key] != self.feed.get(key)})

-        future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed)
+        self.query_pyagg('put', 'feed/%d' % self.feed['id'], up_feed)
class FeedCrawler(AbstractCrawler):
@@ -144,8 +143,8 @@ class FeedCrawler(AbstractCrawler):
    def clean_feed(self):
        """Will reset the errors counters on a feed that have known errors"""
        if self.feed.get('error_count') or self.feed.get('last_error'):
-            future = self.query_pyagg('put', 'feed/%d' % self.feed['id'],
-                                      {'error_count': 0, 'last_error': ''})
+            self.query_pyagg('put', 'feed/%d' % self.feed['id'],
+                             {'error_count': 0, 'last_error': ''})
    def callback(self, response):
        """will fetch the feed and interprete results (304, etag) or will
diff --git a/src/web/lib/feed_utils.py b/src/web/lib/feed_utils.py
index 14e6b82b..80800bec 100644
--- a/src/web/lib/feed_utils.py
+++ b/src/web/lib/feed_utils.py
@@ -9,6 +9,8 @@ from web.lib.utils import try_keys, try_get_icon_url, rebuild_url
logger = logging.getLogger(__name__)
logging.captureWarnings(True)
+ACCEPTED_MIMETYPES = ('application/rss+xml', 'application/rdf+xml',
+                      'application/atom+xml', 'application/xml', 'text/xml')
def is_parsing_ok(parsed_feed):
@@ -96,8 +98,11 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
        del feed['icon_url']

    if not feed.get('link'):
-        alternates = bs_parsed.find_all(check_keys(rel=['alternate'],
-                                                   type=['application/rss+xml']))
-        if len(alternates) >= 1:
-            feed['link'] = rebuild_url(alternates[0].attrs['href'], feed_split)
+        for type_ in ACCEPTED_MIMETYPES:
+            alternates = bs_parsed.find_all(check_keys(
+                rel=['alternate'], type=[type_]))
+            if len(alternates) >= 1:
+                feed['link'] = rebuild_url(alternates[0].attrs['href'],
+                                           feed_split)
+                break
    return feed
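The rewritten block widens the old single-mimetype lookup: each accepted type is tried in declaration order and the first match wins, so an Atom-only or RDF-only site still yields a link. A standalone sketch of the same lookup, with a hand-rolled predicate standing in for the project's check_keys() helper (whose internals are not shown here):

    from bs4 import BeautifulSoup

    ACCEPTED_MIMETYPES = ('application/rss+xml', 'application/rdf+xml',
                          'application/atom+xml', 'application/xml', 'text/xml')

    html = """<head>
    <link rel="alternate" type="application/atom+xml" href="/feeds/all.atom.xml">
    <link rel="alternate" type="text/xml" href="/feeds/all.rss.xml">
    </head>"""

    def matches(type_):
        # Hypothetical stand-in for the check_keys() predicate factory.
        def predicate(tag):
            return (tag.name == 'link'
                    and 'alternate' in tag.get('rel', [])
                    and tag.get('type') == type_)
        return predicate

    soup = BeautifulSoup(html, 'html.parser')
    link = None
    for type_ in ACCEPTED_MIMETYPES:  # most specific types come first
        alternates = soup.find_all(matches(type_))
        if alternates:
            link = alternates[0].attrs['href']
            break  # stop at the first matching type

    print(link)  # -> /feeds/all.atom.xml (atom+xml is tried before text/xml)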
diff --git a/src/web/lib/utils.py b/src/web/lib/utils.py
index aa552a12..88d24ba5 100644
--- a/src/web/lib/utils.py
+++ b/src/web/lib/utils.py
@@ -1,8 +1,10 @@
+import re
import types
import urllib
import logging
import requests
from hashlib import md5
+from flask import request, url_for
logger = logging.getLogger(__name__)
@@ -55,3 +57,17 @@ def try_get_icon_url(url, *splits):
def to_hash(text):
    return md5(text.encode('utf8') if hasattr(text, 'encode') else text)\
        .hexdigest()
+
+
+def clear_string(data):
+    """
+    Clear a string by removing HTML tags, HTML special caracters
+    and consecutive white spaces (more that one).
+    """
+    p = re.compile('<[^>]+>')  # HTML tags
+    q = re.compile('\s')  # consecutive white spaces
+    return p.sub('', q.sub(' ', data))
+
+
+def redirect_url(default='home'):
+    return request.args.get('next') or request.referrer or url_for(default)
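A usage sketch for the two new helpers; the Flask app and routes below are illustrative, not part of this change. Note that the whitespace pattern substitutes one character at a time, so newlines and tabs become single spaces but an existing run of spaces is not collapsed:

    import re
    from flask import Flask, redirect, request, url_for

    app = Flask(__name__)

    def clear_string(data):
        # Same logic as the hunk above, with a raw string for the pattern:
        # map each single whitespace character to a space, then strip tags.
        tags = re.compile('<[^>]+>')
        spaces = re.compile(r'\s')
        return tags.sub('', spaces.sub(' ', data))

    def redirect_url(default='home'):
        # Priority order: explicit ?next= query parameter, then the
        # Referer header, then the named default endpoint.
        return request.args.get('next') or request.referrer or url_for(default)

    @app.route('/')
    def home():
        return clear_string('<p>Hello,\n<b>world</b>!</p>')  # 'Hello, world!'

    @app.route('/login', methods=['POST'])
    def login():
        # After handling the form, bounce back to where the user came from.
        return redirect(redirect_url())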