about | summary | refs | log | tree | commit | diff
path: root/pyaggr3g470r/lib
diff options
context:
space:
mode:
author: François Schmidts <francois.schmidts@gmail.com> 2015-07-03 11:49:28 +0200
committer: François Schmidts <francois.schmidts@gmail.com> 2015-07-03 15:01:23 +0200
commit: 1513cd97911fdf4500ae17f7e8ee6d90ac4bac84 (patch)
tree: b66771229ecbffdf3e784859b33dcf65aaf0d603 /pyaggr3g470r/lib
parent: Minor improvements to the edit feed forms. (diff)
download: newspipe-1513cd97911fdf4500ae17f7e8ee6d90ac4bac84.tar.gz
newspipe-1513cd97911fdf4500ae17f7e8ee6d90ac4bac84.tar.bz2
newspipe-1513cd97911fdf4500ae17f7e8ee6d90ac4bac84.zip
the icon feature
* the icon of a feed is now a URL retrieved from the feed or the site link
* the icon is displayed on the home page, making it visually easier to read
* the HTTP crawler is in charge of keeping it up to date
Diffstat (limited to 'pyaggr3g470r/lib')
-rw-r--r-- pyaggr3g470r/lib/crawler.py | 33
-rw-r--r-- pyaggr3g470r/lib/utils.py | 95
2 files changed, 115 insertions, 13 deletions
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 324f0d8e..2ba5403a 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -24,7 +24,7 @@ from datetime import datetime
from time import strftime, gmtime
from concurrent.futures import ThreadPoolExecutor
from requests_futures.sessions import FuturesSession
-from pyaggr3g470r.lib.utils import default_handler
+from pyaggr3g470r.lib.utils import default_handler, construct_feed_from
logger = logging.getLogger(__name__)
logging.captureWarnings(True)
@@ -136,7 +136,7 @@ class PyAggUpdater(AbstractCrawler):
self.feed = feed
self.entries = entries
self.headers = headers
- self.parsed_feed = parsed_feed.get('feed', {})
+ self.parsed_feed = parsed_feed
super(PyAggUpdater, self).__init__(auth)
def to_article(self, entry):
@@ -188,19 +188,26 @@ class PyAggUpdater(AbstractCrawler):
self.headers.get('etag', ''),
self.headers.get('last-modified', ''))
- dico = {'error_count': 0, 'last_error': None,
- 'etag': self.headers.get('etag', ''),
- 'last_modified': self.headers.get('last-modified',
- strftime('%a, %d %b %Y %X %Z', gmtime())),
- 'site_link': self.parsed_feed.get('link')}
+ up_feed = {'error_count': 0, 'last_error': None,
+ 'etag': self.headers.get('etag', ''),
+ 'last_modified': self.headers.get('last-modified',
+ strftime('%a, %d %b %Y %X %Z', gmtime()))}
+ fresh_feed = construct_feed_from(url=self.feed['link'],
+ fp_parsed=self.parsed_feed,
+ feed=self.feed)
+ for key in ('description', 'site_link', 'icon'):
+ if fresh_feed.get(key) and fresh_feed[key] != self.feed.get(key):
+ up_feed[key] = fresh_feed[key]
if not self.feed.get('title'):
- dico['title'] = self.parsed_feed.get('title', '')
+ up_feed['title'] = fresh_feed.get('title', '')
+
logger.info('%r %r - pushing feed attrs %r',
self.feed['id'], self.feed['title'],
- {key: "%s -> %s" % (dico[key], self.feed.get(key))
- for key in dico if dico[key] != self.feed.get(key)})
- if any([dico[key] != self.feed.get(key) for key in dico]):
- future = self.query_pyagg('put', 'feed/%d' % self.feed['id'], dico)
+ {key: "%s -> %s" % (up_feed[key], self.feed.get(key))
+ for key in up_feed if up_feed[key] != self.feed.get(key)})
+ if any([up_feed[key] != self.feed.get(key) for key in up_feed]):
+ future = self.query_pyagg('put',
+ 'feed/%d' % self.feed['id'], up_feed)
future.add_done_callback(self.get_counter_callback())
@@ -265,7 +272,7 @@ class FeedCrawler(AbstractCrawler):
self.feed['id'], self.feed['title'])
ids, entries = [], {}
- parsed_response = feedparser.parse(response.text)
+ parsed_response = feedparser.parse(response.content)
for entry in parsed_response['entries']:
entry_ids = extract_id(entry)
entry_ids['feed_id'] = self.feed['id']
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a4f4b3ec..a0154b7f 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -1,4 +1,9 @@
import types
+import urllib
+import requests
+import feedparser
+from bs4 import BeautifulSoup, SoupStrainer
+
def default_handler(obj):
"""JSON handler for default query formatting"""
@@ -12,3 +17,93 @@ def default_handler(obj):
return str(obj)
raise TypeError("Object of type %s with value of %r "
"is not JSON serializable" % (type(obj), obj))
+
+
+def try_keys(dico, *keys):
+ for key in keys:
+ if key in dico:
+ return dico[key]
+ return
+
+
+def rebuild_url(url, base_split):
+ split = urllib.parse.urlsplit(url)
+ if split.scheme and split.netloc:
+ return url # url is fine
+ new_split = urllib.parse.SplitResult(
+ scheme=split.scheme or base_split.scheme,
+ netloc=split.netloc or base_split.netloc,
+ path=split.path, query='', fragment='')
+ return urllib.parse.urlunsplit(new_split)
+
+
+def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
+ if url is None and fp_parsed is not None:
+ url = fp_parsed.get('url')
+ if url is not None and fp_parsed is None:
+ response = requests.get(url, verify=False)
+ fp_parsed = feedparser.parse(response.content)
+ assert url is not None and fp_parsed is not None
+ feed = feed or {}
+ split = urllib.parse.urlsplit(url)
+ if not fp_parsed['bozo']:
+ feed['link'] = url
+ feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
+ feed['title'] = fp_parsed['feed'].get('title')
+ feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
+ feed['icon'] = try_keys(fp_parsed['feed'], 'icon')
+ else:
+ feed['site_link'] = url
+
+ if feed.get('site_link'):
+ feed['site_link'] = rebuild_url(feed['site_link'], split)
+ split = urllib.parse.urlsplit(feed['site_link'])
+
+ if feed.get('icon'):
+ feed['icon'] = rebuild_url(feed['icon'], split)
+
+ if not feed.get('site_link') or not query_site \
+ or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
+ return feed
+
+ response = requests.get(feed['site_link'], verify=False)
+ bs_parsed = BeautifulSoup(response.content, 'html.parser',
+ parse_only=SoupStrainer('head'))
+
+ if not feed.get('title'):
+ try:
+ feed['title'] = bs_parsed.find_all('title')[0].text
+ except Exception:
+ pass
+
+ def check_keys(**kwargs):
+ def wrapper(elem):
+ for key, vals in kwargs.items():
+ if not elem.has_attr(key):
+ return False
+ if not all(val in elem.attrs[key] for val in vals):
+ return False
+ return True
+ return wrapper
+
+ if not feed.get('icon'):
+ icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
+ if not len(icons):
+ icons = bs_parsed.find_all(check_keys(rel=['icon']))
+ if len(icons) >= 1:
+ feed['icon'] = rebuild_url(icons[0].attrs['href'], split)
+ else: # trying the default one
+ icon = rebuild_url('/favicon.ico', split)
+ if requests.get(icon, verify=False).ok:
+ feed['icon'] = icon
+
+ if not feed.get('link'):
+ alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
+ type=['application/rss+xml']))
+ if len(alternate) == 1:
+ feed['link'] = rebuild_url(alternate[0].attrs['href'], split)
+ elif len(alternate) > 1:
+ feed['link'] = rebuild_url(alternate[0].attrs['href'], split)
+ feed['other_link'] = [rebuild_url(al.attrs['href'], split)
+ for al in alternate[1:]]
+ return feed
bgstack15