Diffstat (limited to 'pyaggr3g470r/lib')
-rw-r--r--  pyaggr3g470r/lib/crawler.py    | 7 +++++++
-rw-r--r--  pyaggr3g470r/lib/feed_utils.py | 7 ++++---
-rw-r--r--  pyaggr3g470r/lib/utils.py      | 2 ++
3 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index de557e45..e5998776 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -17,6 +17,7 @@ import conf
 import json
 import logging
 import feedparser
+from datetime import datetime, timedelta
 from functools import wraps
 from time import strftime, gmtime
 from concurrent.futures import ThreadPoolExecutor
@@ -118,7 +119,9 @@ class PyAggUpdater(AbstractCrawler):
         results = response.result().json()
         logger.debug('%r %r - %d entries were not matched and will be created',
                      self.feed['id'], self.feed['title'], len(results))
+        article_created = False
         for id_to_create in results:
+            article_created = True
             entry = construct_article(
                     self.entries[tuple(sorted(id_to_create.items()))],
                     self.feed)
@@ -144,6 +147,10 @@ class PyAggUpdater(AbstractCrawler):
         if not self.feed.get('title'):
             up_feed['title'] = fresh_feed.get('title', '')
         up_feed['user_id'] = self.feed['user_id']
+        # re-getting that feed earlier since new entries appeared
+        if article_created:
+            up_feed['last_retrieved'] \
+                    = (datetime.now() - timedelta(minutes=45)).isoformat()
         logger.info('%r %r - pushing feed attrs %r',
                     self.feed['id'], self.feed['title'],
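
Taken together, the two crawler.py hunks implement a small adaptive-refresh
trick: a flag records whether this update created at least one new article,
and if it did, the feed's last_retrieved timestamp is backdated by 45 minutes
so the scheduler treats the feed as due for another fetch sooner. A minimal
sketch of the idea, assuming only the 45-minute offset from the diff (the
helper name and its parameter are ours, not the project's):

from datetime import datetime, timedelta

def backdated_last_retrieved(minutes=45):
    # Pretend the fetch happened `minutes` ago: a staler-looking
    # last_retrieved makes the crawler revisit the feed earlier.
    return (datetime.now() - timedelta(minutes=minutes)).isoformat()

# Called at 12:00 this yields an 11:15 stamp, e.g. '2015-03-10T11:15:00.000000'
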
diff --git a/pyaggr3g470r/lib/feed_utils.py b/pyaggr3g470r/lib/feed_utils.py
index a7149d79..367fd4b5 100644
--- a/pyaggr3g470r/lib/feed_utils.py
+++ b/pyaggr3g470r/lib/feed_utils.py
@@ -23,6 +23,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
     assert url is not None and fp_parsed is not None
     feed = feed or {}
     feed_split = urllib.parse.urlsplit(url)
+    site_split = None
     if not fp_parsed['bozo']:
         feed['link'] = url
         feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
@@ -82,8 +83,8 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
             del feed['icon']
 
     if not feed.get('link'):
-        alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
+        alternates = bs_parsed.find_all(check_keys(rel=['alternate'],
                 type=['application/rss+xml']))
-        if len(alternate) >= 1:
-            feed['link'] = alternate[0].attrs['href']
+        if len(alternates) >= 1:
+            feed['link'] = rebuild_url(alternates[0].attrs['href'], feed_split)
     return feed
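
Two fixes land in feed_utils.py: site_split is initialised up front so it
exists even on code paths that never parse the site page, and the href taken
from a <link rel="alternate"> tag is now passed through rebuild_url, because
such hrefs are frequently relative (e.g. /rss.xml) and would otherwise be
stored as broken links. A sketch of why the rebuild step matters, using a
stand-in helper; the project's real rebuild_url lives in
pyaggr3g470r/lib/utils.py and may differ:

from urllib.parse import urlsplit, urlunsplit

def rebuild(url, base_split):
    # Stand-in for lib/utils.py's rebuild_url: borrow the scheme and
    # netloc from base_split when the candidate URL lacks them.
    split = urlsplit(url)
    return urlunsplit((split.scheme or base_split.scheme,
                       split.netloc or base_split.netloc,
                       split.path, split.query, split.fragment))

feed_split = urlsplit('https://example.com/blog/feed')
print(rebuild('/rss.xml', feed_split))            # https://example.com/rss.xml
print(rebuild('https://other.org/rss', feed_split))  # absolute URLs unchanged
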
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index 280256f6..62284de1 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -42,6 +42,8 @@ def rebuild_url(url, base_split):
 
 def try_splits(url, *splits):
     for split in splits:
+        if split is None:
+            continue
         rb_url = rebuild_url(url, split)
         response = requests.get(rb_url, verify=False, timeout=10)
         if response.ok and 'html' not in response.headers['content-type']:
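
The utils.py guard is the other half of the site_split = None initialisation
above: try_splits may now receive None among its candidate base splits and
skips it rather than crashing inside rebuild_url. A self-contained sketch of
the guarded loop, again with a stand-in rebuild helper rather than the
project's exact code:

import requests
from urllib.parse import urlsplit, urlunsplit

def rebuild(url, base_split):
    # Stand-in for lib/utils.py's rebuild_url (see the sketch above).
    split = urlsplit(url)
    return urlunsplit((split.scheme or base_split.scheme,
                       split.netloc or base_split.netloc,
                       split.path, split.query, split.fragment))

def try_splits(url, *splits):
    # Return the first rebuilt URL that answers with non-HTML content;
    # None entries (e.g. a site_split that was never filled in) are skipped.
    for split in splits:
        if split is None:
            continue
        rb_url = rebuild(url, split)
        response = requests.get(rb_url, verify=False, timeout=10)
        if response.ok and 'html' not in response.headers['content-type']:
            return rb_url
    return None
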