diff options
Diffstat (limited to 'src/web/lib/article_utils.py')
-rw-r--r-- | src/web/lib/article_utils.py | 74 |
1 files changed, 74 insertions, 0 deletions
diff --git a/src/web/lib/article_utils.py b/src/web/lib/article_utils.py new file mode 100644 index 00000000..02ca2cd1 --- /dev/null +++ b/src/web/lib/article_utils.py @@ -0,0 +1,74 @@ +import logging +import requests +import dateutil.parser +from datetime import datetime + +import conf +from web.lib.utils import to_hash + +logger = logging.getLogger(__name__) + + +def extract_id(entry, keys=[('link', 'link'), ('published', 'date'), + ('updated', 'date')], force_id=False): + """For a given entry will return a dict that allows to identify it. The + dict will be constructed on the uid of the entry. if that identifier is + absent, the dict will be constructed upon the values of "keys". + """ + entry_id = entry.get('entry_id') or entry.get('id') + if entry_id: + return {'entry_id': entry_id} + if not entry_id and force_id: + return to_hash("".join(entry[entry_key] for _, entry_key in keys + if entry_key in entry).encode('utf8')) + else: + ids = {} + for entry_key, pyagg_key in keys: + if entry_key in entry and pyagg_key not in ids: + ids[pyagg_key] = entry[entry_key] + if 'date' in pyagg_key: + ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\ + .isoformat() + return ids + + +def construct_article(entry, feed): + if hasattr(feed, 'dump'): # this way can be a sqlalchemy obj or a dict + feed = feed.dump() + "Safe method to transorm a feedparser entry into an article" + now = datetime.now() + date = None + for date_key in ('published', 'updated'): + if entry.get(date_key): + try: + date = dateutil.parser.parse(entry[date_key]) + except Exception: + pass + else: + break + content = '' + if entry.get('content'): + content = entry['content'][0]['value'] + elif entry.get('summary'): + content = entry['summary'] + + article_link = entry.get('link') + if conf.RESOLVE_ARTICLE_URL and article_link: + try: + # resolves URL behind proxies + # (like feedproxy.google.com) + response = requests.get(article_link, verify=False, timeout=5.0) + article_link = response.url + except Exception as error: + logger.warning("Unable to get the real URL of %s. Error: %s", + article_link, error) + + return {'feed_id': feed['id'], + 'user_id': feed['user_id'], + 'entry_id': extract_id(entry).get('entry_id', None), + 'link': entry.get('link', feed['site_link']), + 'title': entry.get('title', 'No title'), + 'readed': False, 'like': False, + 'content': content, + 'retrieved_date': now.isoformat(), + 'date': (date or now).isoformat()} |