author | François Schmidts <francois.schmidts@gmail.com> | 2015-07-06 10:28:58 +0200 |
---|---|---|
committer | François Schmidts <francois.schmidts@gmail.com> | 2015-07-06 11:10:33 +0200 |
commit | 75df52051b167425adcfb68797f77fcbcad33c4e (patch) | |
tree | 810b6e887c767f35f11e0412a404352390f7cacc /pyaggr3g470r/lib/article_utils.py | |
parent | making bookmarklet work for any url (diff) |
download | newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.tar.gz newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.tar.bz2 newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.zip |
light refactor
* both crawlers now use the same utils methods (see the sketch below)
* the original crawler now uses more of the controllers, enabling the filters feature
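For illustration only, here is a minimal sketch (not part of this commit) of how a crawler loop might rely on the shared helpers; the `process_feed` function and the exact keys of the `feed` mapping (`id`, `user_id`, `link`, `site_link`) are assumptions, not code from the repository:

```python
# Hypothetical crawler snippet -- a sketch of how the shared helpers
# could be called; the function name and feed keys are assumptions.
import feedparser

from pyaggr3g470r.lib.article_utils import construct_article, extract_id


def process_feed(feed):
    """Yield (identifying keys, article dict) pairs for one feed."""
    # `feed` is assumed to be a dict-like row exposing at least
    # 'id', 'user_id', 'link' and 'site_link'.
    parsed = feedparser.parse(feed['link'])
    for entry in parsed.entries:
        # extract_id() returns either {'entry_id': ...} or the fallback
        # keys (link / parsed dates) usable to match an existing article.
        ids = extract_id(entry)
        # construct_article() normalises the entry into the dict that
        # gets handed over to the article controller.
        yield ids, construct_article(entry, feed)
```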
Diffstat (limited to 'pyaggr3g470r/lib/article_utils.py')
-rw-r--r-- | pyaggr3g470r/lib/article_utils.py | 94 |
1 file changed, 94 insertions, 0 deletions
```diff
diff --git a/pyaggr3g470r/lib/article_utils.py b/pyaggr3g470r/lib/article_utils.py
new file mode 100644
index 00000000..023be9a7
--- /dev/null
+++ b/pyaggr3g470r/lib/article_utils.py
@@ -0,0 +1,94 @@
+import logging
+import requests
+import dateutil.parser
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+import conf
+from pyaggr3g470r.lib.utils import to_hash
+
+logger = logging.getLogger(__name__)
+
+
+def extract_id(entry, keys=[('link', 'link'),
+                            ('published', 'retrieved_date'),
+                            ('updated', 'retrieved_date')], force_id=False):
+    """For a given entry will return a dict that allows to identify it. The
+    dict will be constructed on the uid of the entry. if that identifier is
+    absent, the dict will be constructed upon the values of "keys".
+    """
+    entry_id = entry.get('entry_id') or entry.get('id')
+    if entry_id:
+        return {'entry_id': entry_id}
+    if not entry_id and force_id:
+        entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys
+                                   if entry_key in entry).encode('utf8'))
+    else:
+        ids = {}
+        for entry_key, pyagg_key in keys:
+            if entry_key in entry and pyagg_key not in ids:
+                ids[pyagg_key] = entry[entry_key]
+                if 'date' in pyagg_key:
+                    ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\
+                                                    .isoformat()
+        return ids
+
+
+def construct_article(entry, feed):
+    "Safe method to transorm a feedparser entry into an article"
+    now = datetime.now()
+
+    for date_key in ('published', 'updated'):
+        if entry.get(date_key):
+            try:
+                date = dateutil.parser.parse(entry[date_key])
+            except Exception:
+                pass
+            else:
+                break
+    content = ''
+    if entry.get('content'):
+        content = entry['content'][0]['value']
+    elif entry.get('summary'):
+        content = entry['summary']
+
+    description = entry.get('description', '')
+    try:
+        description = entry.content[0].value
+    except Exception:
+        pass
+
+    try:
+        soup = BeautifulSoup(description, "lxml")
+        # Prevents BeautifulSoup4 from adding extra <html><body> tags
+        # to the soup with the lxml parser.
+        if soup.html.body:
+            description = soup.html.body.decode_contents()
+        elif soup.html:
+            description = soup.html.decode_contents()
+        else:
+            description = soup.decode()
+    except Exception:
+        pass
+
+    article_link = entry.get('link')
+    if conf.RESOLVE_ARTICLE_URL and article_link:
+        try:
+            # resolves URL behind proxies
+            # (like feedproxy.google.com)
+            response = requests.get(article_link, verify=False, timeout=5.0)
+            article_link = response.url
+        except Exception as error:
+            logger.warning("Unable to get the real URL of %s. Error: %s",
+                           article_link, error)
+
+    return {'feed_id': feed['id'],
+            'user_id': feed['user_id'],
+            'entry_id': extract_id(entry).get('entry_id', None),
+            'link': entry.get('link', feed['site_link']),
+            'title': entry.get('title', 'No title'),
+            'readed': False, 'like': False,
+            'description': description,
+            'content': content,
+            'retrieved_date': now.isoformat(),
+            'date': (date or now).isoformat()}
```
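To make the two code paths of `extract_id` concrete, here is a small, hypothetical demonstration (the sample entries below are invented for illustration, not taken from the repository):

```python
# Hypothetical demonstration of extract_id(); the sample entries are invented.
from pyaggr3g470r.lib.article_utils import extract_id

# 1. The entry carries its own id: only that id is returned.
entry_with_id = {'id': 'urn:example:1234', 'link': 'https://example.com/a'}
print(extract_id(entry_with_id))
# -> {'entry_id': 'urn:example:1234'}

# 2. No id: the fallback keys listed in `keys` (link plus a parsed
#    published/updated date) are collected instead, so the article can
#    still be matched against what is already stored.
entry_without_id = {'link': 'https://example.com/a',
                    'published': '2015-07-06T10:28:58+02:00'}
print(extract_id(entry_without_id))
# -> {'link': 'https://example.com/a',
#     'retrieved_date': '2015-07-06T10:28:58+02:00'}
```

`construct_article` builds on the same idea: every lookup on the entry goes through `get` or a `try`/`except`, so a malformed feed entry degrades to default values instead of aborting the crawl.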