author | François Schmidts <francois.schmidts@gmail.com> | 2015-07-06 10:28:58 +0200 |
---|---|---|
committer | François Schmidts <francois.schmidts@gmail.com> | 2015-07-06 11:10:33 +0200 |
commit | 75df52051b167425adcfb68797f77fcbcad33c4e (patch) | |
tree | 810b6e887c767f35f11e0412a404352390f7cacc /pyaggr3g470r/lib/article_utils.py | |
parent | making bookmarklet work for any url (diff) |
download | newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.tar.gz newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.tar.bz2 newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.zip |
light refactor
* both crawlers now use the same utils methods (see the sketch below)
* the original crawler now uses more of the controllers, enabling the filters feature
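For illustration only, here is a minimal sketch (not part of this commit) of how a crawler loop might rely on the shared helpers; the `process_feed` function and the exact keys of the `feed` mapping (`id`, `user_id`, `link`, `site_link`) are assumptions, not code from the repository:

```python
# Hypothetical crawler snippet -- a sketch of how the shared helpers
# could be called; the function name and feed keys are assumptions.
import feedparser

from pyaggr3g470r.lib.article_utils import construct_article, extract_id


def process_feed(feed):
    """Yield (identifying keys, article dict) pairs for one feed."""
    # `feed` is assumed to be a dict-like row exposing at least
    # 'id', 'user_id', 'link' and 'site_link'.
    parsed = feedparser.parse(feed['link'])
    for entry in parsed.entries:
        # extract_id() returns either {'entry_id': ...} or the fallback
        # keys (link / parsed dates) usable to match an existing article.
        ids = extract_id(entry)
        # construct_article() normalises the entry into the dict that
        # gets handed over to the article controller.
        yield ids, construct_article(entry, feed)
```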
Diffstat (limited to 'pyaggr3g470r/lib/article_utils.py')
-rw-r--r-- | pyaggr3g470r/lib/article_utils.py | 94 |
1 file changed, 94 insertions, 0 deletions
```diff
diff --git a/pyaggr3g470r/lib/article_utils.py b/pyaggr3g470r/lib/article_utils.py
new file mode 100644
index 00000000..023be9a7
--- /dev/null
+++ b/pyaggr3g470r/lib/article_utils.py
@@ -0,0 +1,94 @@
+import logging
+import requests
+import dateutil.parser
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+import conf
+from pyaggr3g470r.lib.utils import to_hash
+
+logger = logging.getLogger(__name__)
+
+
+def extract_id(entry, keys=[('link', 'link'),
+                            ('published', 'retrieved_date'),
+                            ('updated', 'retrieved_date')], force_id=False):
+    """For a given entry will return a dict that allows to identify it. The
+    dict will be constructed on the uid of the entry. if that identifier is
+    absent, the dict will be constructed upon the values of "keys".
+    """
+    entry_id = entry.get('entry_id') or entry.get('id')
+    if entry_id:
+        return {'entry_id': entry_id}
+    if not entry_id and force_id:
+        entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys
+                                   if entry_key in entry).encode('utf8'))
+    else:
+        ids = {}
+        for entry_key, pyagg_key in keys:
+            if entry_key in entry and pyagg_key not in ids:
+                ids[pyagg_key] = entry[entry_key]
+                if 'date' in pyagg_key:
+                    ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\
+                                                    .isoformat()
+        return ids
+
+
+def construct_article(entry, feed):
+    "Safe method to transorm a feedparser entry into an article"
+    now = datetime.now()
+
+    for date_key in ('published', 'updated'):
+        if entry.get(date_key):
+            try:
+                date = dateutil.parser.parse(entry[date_key])
+            except Exception:
+                pass
+            else:
+                break
+    content = ''
+    if entry.get('content'):
+        content = entry['content'][0]['value']
+    elif entry.get('summary'):
+        content = entry['summary']
+
+    description = entry.get('description', '')
+    try:
+        description = entry.content[0].value
+    except Exception:
+        pass
+
+    try:
+        soup = BeautifulSoup(description, "lxml")
+        # Prevents BeautifulSoup4 from adding extra <html><body> tags
+        # to the soup with the lxml parser.
+        if soup.html.body:
+            description = soup.html.body.decode_contents()
+        elif soup.html:
+            description = soup.html.decode_contents()
+        else:
+            description = soup.decode()
+    except Exception:
+        pass
+
+    article_link = entry.get('link')
+    if conf.RESOLVE_ARTICLE_URL and article_link:
+        try:
+            # resolves URL behind proxies
+            # (like feedproxy.google.com)
+            response = requests.get(article_link, verify=False, timeout=5.0)
+            article_link = response.url
+        except Exception as error:
+            logger.warning("Unable to get the real URL of %s. Error: %s",
+                           article_link, error)
+
+    return {'feed_id': feed['id'],
+            'user_id': feed['user_id'],
+            'entry_id': extract_id(entry).get('entry_id', None),
+            'link': entry.get('link', feed['site_link']),
+            'title': entry.get('title', 'No title'),
+            'readed': False, 'like': False,
+            'description': description,
+            'content': content,
+            'retrieved_date': now.isoformat(),
+            'date': (date or now).isoformat()}
```
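To make the two code paths of `extract_id` concrete, here is a small, hypothetical demonstration (the sample entries below are invented for illustration, not taken from the repository):

```python
# Hypothetical demonstration of extract_id(); the sample entries are invented.
from pyaggr3g470r.lib.article_utils import extract_id

# 1. The entry carries its own id: only that id is returned.
entry_with_id = {'id': 'urn:example:1234', 'link': 'https://example.com/a'}
print(extract_id(entry_with_id))
# -> {'entry_id': 'urn:example:1234'}

# 2. No id: the fallback keys listed in `keys` (link plus a parsed
#    published/updated date) are collected instead, so the article can
#    still be matched against what is already stored.
entry_without_id = {'link': 'https://example.com/a',
                    'published': '2015-07-06T10:28:58+02:00'}
print(extract_id(entry_without_id))
# -> {'link': 'https://example.com/a',
#     'retrieved_date': '2015-07-06T10:28:58+02:00'}
```

`construct_article` builds on the same idea: every lookup on the entry goes through `get` or a `try`/`except`, so a malformed feed entry degrades to default values instead of aborting the crawl.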